// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

// Test new aarch64 intrinsics and types

#include <arm_neon.h>

7 // CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
8 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
9 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
10 // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
11 // CHECK: ret <4 x i16> [[ADD]]
test_vmla_lane_s16(int16x4_t a,int16x4_t b,int16x4_t v)12 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
13 return vmla_lane_s16(a, b, v, 3);
14 }
15
16 // CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
17 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
18 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
19 // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
20 // CHECK: ret <8 x i16> [[ADD]]
test_vmlaq_lane_s16(int16x8_t a,int16x8_t b,int16x4_t v)21 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
22 return vmlaq_lane_s16(a, b, v, 3);
23 }
24
25 // CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
26 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
27 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
28 // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
29 // CHECK: ret <2 x i32> [[ADD]]
test_vmla_lane_s32(int32x2_t a,int32x2_t b,int32x2_t v)30 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
31 return vmla_lane_s32(a, b, v, 1);
32 }
33
34 // CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
35 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
36 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
37 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
38 // CHECK: ret <4 x i32> [[ADD]]
test_vmlaq_lane_s32(int32x4_t a,int32x4_t b,int32x2_t v)39 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
40 return vmlaq_lane_s32(a, b, v, 1);
41 }
42
43 // CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
44 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
45 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
46 // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
47 // CHECK: ret <4 x i16> [[ADD]]
test_vmla_laneq_s16(int16x4_t a,int16x4_t b,int16x8_t v)48 int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
49 return vmla_laneq_s16(a, b, v, 7);
50 }
51
52 // CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
53 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
54 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
55 // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
56 // CHECK: ret <8 x i16> [[ADD]]
test_vmlaq_laneq_s16(int16x8_t a,int16x8_t b,int16x8_t v)57 int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
58 return vmlaq_laneq_s16(a, b, v, 7);
59 }
60
61 // CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
62 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
63 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
64 // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
65 // CHECK: ret <2 x i32> [[ADD]]
test_vmla_laneq_s32(int32x2_t a,int32x2_t b,int32x4_t v)66 int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
67 return vmla_laneq_s32(a, b, v, 3);
68 }
69
70 // CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
71 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
72 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
73 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
74 // CHECK: ret <4 x i32> [[ADD]]
test_vmlaq_laneq_s32(int32x4_t a,int32x4_t b,int32x4_t v)75 int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
76 return vmlaq_laneq_s32(a, b, v, 3);
77 }
78
79 // CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
80 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
81 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
82 // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
83 // CHECK: ret <4 x i16> [[SUB]]
test_vmls_lane_s16(int16x4_t a,int16x4_t b,int16x4_t v)84 int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
85 return vmls_lane_s16(a, b, v, 3);
86 }
87
88 // CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
89 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
90 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
91 // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
92 // CHECK: ret <8 x i16> [[SUB]]
test_vmlsq_lane_s16(int16x8_t a,int16x8_t b,int16x4_t v)93 int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
94 return vmlsq_lane_s16(a, b, v, 3);
95 }
96
97 // CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
98 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
99 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
100 // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
101 // CHECK: ret <2 x i32> [[SUB]]
test_vmls_lane_s32(int32x2_t a,int32x2_t b,int32x2_t v)102 int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
103 return vmls_lane_s32(a, b, v, 1);
104 }
105
106 // CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
107 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
108 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
109 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
110 // CHECK: ret <4 x i32> [[SUB]]
test_vmlsq_lane_s32(int32x4_t a,int32x4_t b,int32x2_t v)111 int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
112 return vmlsq_lane_s32(a, b, v, 1);
113 }
114
115 // CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
116 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
117 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
118 // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
119 // CHECK: ret <4 x i16> [[SUB]]
test_vmls_laneq_s16(int16x4_t a,int16x4_t b,int16x8_t v)120 int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
121 return vmls_laneq_s16(a, b, v, 7);
122 }
123
124 // CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
125 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
126 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
127 // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
128 // CHECK: ret <8 x i16> [[SUB]]
test_vmlsq_laneq_s16(int16x8_t a,int16x8_t b,int16x8_t v)129 int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
130 return vmlsq_laneq_s16(a, b, v, 7);
131 }
132
133 // CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
134 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
135 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
136 // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
137 // CHECK: ret <2 x i32> [[SUB]]
test_vmls_laneq_s32(int32x2_t a,int32x2_t b,int32x4_t v)138 int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
139 return vmls_laneq_s32(a, b, v, 3);
140 }
141
142 // CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
143 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
144 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
145 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
146 // CHECK: ret <4 x i32> [[SUB]]
test_vmlsq_laneq_s32(int32x4_t a,int32x4_t b,int32x4_t v)147 int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
148 return vmlsq_laneq_s32(a, b, v, 3);
149 }
150
151 // CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
152 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
153 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
154 // CHECK: ret <4 x i16> [[MUL]]
test_vmul_lane_s16(int16x4_t a,int16x4_t v)155 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
156 return vmul_lane_s16(a, v, 3);
157 }
158
159 // CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
160 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
161 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
162 // CHECK: ret <8 x i16> [[MUL]]
test_vmulq_lane_s16(int16x8_t a,int16x4_t v)163 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
164 return vmulq_lane_s16(a, v, 3);
165 }
166
167 // CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
168 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
169 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
170 // CHECK: ret <2 x i32> [[MUL]]
test_vmul_lane_s32(int32x2_t a,int32x2_t v)171 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
172 return vmul_lane_s32(a, v, 1);
173 }
174
175 // CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
176 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
177 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
178 // CHECK: ret <4 x i32> [[MUL]]
test_vmulq_lane_s32(int32x4_t a,int32x2_t v)179 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
180 return vmulq_lane_s32(a, v, 1);
181 }
182
183 // CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 {
184 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
185 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
186 // CHECK: ret <4 x i16> [[MUL]]
test_vmul_lane_u16(uint16x4_t a,uint16x4_t v)187 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
188 return vmul_lane_u16(a, v, 3);
189 }
190
191 // CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 {
192 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
193 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
194 // CHECK: ret <8 x i16> [[MUL]]
test_vmulq_lane_u16(uint16x8_t a,uint16x4_t v)195 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
196 return vmulq_lane_u16(a, v, 3);
197 }
198
199 // CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 {
200 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
201 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
202 // CHECK: ret <2 x i32> [[MUL]]
test_vmul_lane_u32(uint32x2_t a,uint32x2_t v)203 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
204 return vmul_lane_u32(a, v, 1);
205 }
206
207 // CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 {
208 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
209 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
210 // CHECK: ret <4 x i32> [[MUL]]
test_vmulq_lane_u32(uint32x4_t a,uint32x2_t v)211 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
212 return vmulq_lane_u32(a, v, 1);
213 }
214
215 // CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
216 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
217 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
218 // CHECK: ret <4 x i16> [[MUL]]
test_vmul_laneq_s16(int16x4_t a,int16x8_t v)219 int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
220 return vmul_laneq_s16(a, v, 7);
221 }
222
223 // CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
224 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
225 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
226 // CHECK: ret <8 x i16> [[MUL]]
test_vmulq_laneq_s16(int16x8_t a,int16x8_t v)227 int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
228 return vmulq_laneq_s16(a, v, 7);
229 }
230
231 // CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
232 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
233 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
234 // CHECK: ret <2 x i32> [[MUL]]
test_vmul_laneq_s32(int32x2_t a,int32x4_t v)235 int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
236 return vmul_laneq_s32(a, v, 3);
237 }
238
239 // CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
240 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
241 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
242 // CHECK: ret <4 x i32> [[MUL]]
test_vmulq_laneq_s32(int32x4_t a,int32x4_t v)243 int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
244 return vmulq_laneq_s32(a, v, 3);
245 }
246
247 // CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 {
248 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
249 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
250 // CHECK: ret <4 x i16> [[MUL]]
test_vmul_laneq_u16(uint16x4_t a,uint16x8_t v)251 uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
252 return vmul_laneq_u16(a, v, 7);
253 }
254
255 // CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 {
256 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
257 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
258 // CHECK: ret <8 x i16> [[MUL]]
test_vmulq_laneq_u16(uint16x8_t a,uint16x8_t v)259 uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
260 return vmulq_laneq_u16(a, v, 7);
261 }
262
263 // CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 {
264 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
265 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
266 // CHECK: ret <2 x i32> [[MUL]]
test_vmul_laneq_u32(uint32x2_t a,uint32x4_t v)267 uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
268 return vmul_laneq_u32(a, v, 3);
269 }
270
271 // CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 {
272 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
273 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
274 // CHECK: ret <4 x i32> [[MUL]]
test_vmulq_laneq_u32(uint32x4_t a,uint32x4_t v)275 uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
276 return vmulq_laneq_u32(a, v, 3);
277 }
278
279 // CHECK-LABEL: define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
280 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
281 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
282 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
283 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
284 // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
285 // CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
286 // CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
287 // CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
288 // CHECK: ret <2 x float> [[FMLA2]]
test_vfma_lane_f32(float32x2_t a,float32x2_t b,float32x2_t v)289 float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
290 return vfma_lane_f32(a, b, v, 1);
291 }
292
293 // CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
294 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
295 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
296 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
297 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
298 // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
299 // CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
300 // CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
301 // CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
302 // CHECK: ret <4 x float> [[FMLA2]]
test_vfmaq_lane_f32(float32x4_t a,float32x4_t b,float32x2_t v)303 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
304 return vfmaq_lane_f32(a, b, v, 1);
305 }
306
307 // CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
308 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
309 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
310 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
311 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
312 // CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
313 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
314 // CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
315 // CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
316 // CHECK: ret <2 x float> [[TMP6]]
test_vfma_laneq_f32(float32x2_t a,float32x2_t b,float32x4_t v)317 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
318 return vfma_laneq_f32(a, b, v, 3);
319 }
320
321 // CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
322 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
323 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
324 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
325 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
326 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
327 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
328 // CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
329 // CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
330 // CHECK: ret <4 x float> [[TMP6]]
test_vfmaq_laneq_f32(float32x4_t a,float32x4_t b,float32x4_t v)331 float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
332 return vfmaq_laneq_f32(a, b, v, 3);
333 }
334
335 // CHECK-LABEL: define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
336 // CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
337 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
338 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
339 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
340 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
341 // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
342 // CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
343 // CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
344 // CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
345 // CHECK: ret <2 x float> [[FMLA2]]
test_vfms_lane_f32(float32x2_t a,float32x2_t b,float32x2_t v)346 float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
347 return vfms_lane_f32(a, b, v, 1);
348 }
349
350 // CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
351 // CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
352 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
353 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
354 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
355 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
356 // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
357 // CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
358 // CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
359 // CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
360 // CHECK: ret <4 x float> [[FMLA2]]
test_vfmsq_lane_f32(float32x4_t a,float32x4_t b,float32x2_t v)361 float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
362 return vfmsq_lane_f32(a, b, v, 1);
363 }
364
365 // CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
366 // CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
367 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
368 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
369 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
370 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
371 // CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
372 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
373 // CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
374 // CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
375 // CHECK: ret <2 x float> [[TMP6]]
test_vfms_laneq_f32(float32x2_t a,float32x2_t b,float32x4_t v)376 float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
377 return vfms_laneq_f32(a, b, v, 3);
378 }
379
380 // CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
381 // CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
382 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
383 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
384 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
385 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
386 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
387 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
388 // CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
389 // CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
390 // CHECK: ret <4 x float> [[TMP6]]
test_vfmsq_laneq_f32(float32x4_t a,float32x4_t b,float32x4_t v)391 float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
392 return vfmsq_laneq_f32(a, b, v, 3);
393 }
394
395 // CHECK-LABEL: define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 {
396 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
397 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
398 // CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
399 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
400 // CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
401 // CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
402 // CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
403 // CHECK: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
404 // CHECK: ret <2 x double> [[FMLA2]]
test_vfmaq_lane_f64(float64x2_t a,float64x2_t b,float64x1_t v)405 float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
406 return vfmaq_lane_f64(a, b, v, 0);
407 }
408
409 // CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
410 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
411 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
412 // CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
413 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
414 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
415 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
416 // CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
417 // CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
418 // CHECK: ret <2 x double> [[TMP6]]
test_vfmaq_laneq_f64(float64x2_t a,float64x2_t b,float64x2_t v)419 float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
420 return vfmaq_laneq_f64(a, b, v, 1);
421 }
422
423 // CHECK-LABEL: define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 {
424 // CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
425 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
426 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
427 // CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
428 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
429 // CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
430 // CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
431 // CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
432 // CHECK: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
433 // CHECK: ret <2 x double> [[FMLA2]]
test_vfmsq_lane_f64(float64x2_t a,float64x2_t b,float64x1_t v)434 float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
435 return vfmsq_lane_f64(a, b, v, 0);
436 }
437
438 // CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
439 // CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
440 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
441 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
442 // CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
443 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
444 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
445 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
446 // CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
447 // CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
448 // CHECK: ret <2 x double> [[TMP6]]
test_vfmsq_laneq_f64(float64x2_t a,float64x2_t b,float64x2_t v)449 float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
450 return vfmsq_laneq_f64(a, b, v, 1);
451 }
452
453 // CHECK-LABEL: define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) #0 {
454 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
455 // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
456 // CHECK: [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
457 // CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
458 // CHECK: ret float [[TMP2]]
test_vfmas_laneq_f32(float32_t a,float32_t b,float32x4_t v)459 float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
460 return vfmas_laneq_f32(a, b, v, 3);
461 }
462
463 // CHECK-LABEL: define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) #0 {
464 // CHECK: [[SUB:%.*]] = fsub double -0.000000e+00, %b
465 // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %v to <8 x i8>
466 // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
467 // CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
468 // CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
469 // CHECK: ret double [[TMP2]]
test_vfmsd_lane_f64(float64_t a,float64_t b,float64x1_t v)470 float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
471 return vfmsd_lane_f64(a, b, v, 0);
472 }
473
474 // CHECK-LABEL: define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) #0 {
475 // CHECK: [[SUB:%.*]] = fsub float -0.000000e+00, %b
476 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
477 // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
478 // CHECK: [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
479 // CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
480 // CHECK: ret float [[TMP2]]
test_vfmss_laneq_f32(float32_t a,float32_t b,float32x4_t v)481 float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
482 return vfmss_laneq_f32(a, b, v, 3);
483 }
484
485 // CHECK-LABEL: define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) #0 {
486 // CHECK: [[SUB:%.*]] = fsub double -0.000000e+00, %b
487 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v to <16 x i8>
488 // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
489 // CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
490 // CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
491 // CHECK: ret double [[TMP2]]
test_vfmsd_laneq_f64(float64_t a,float64_t b,float64x2_t v)492 float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
493 return vfmsd_laneq_f64(a, b, v, 1);
494 }
495
// Widening multiply-accumulate from lane: lane 3 of v is splatted,
// smull widens i16 products to i32, then the accumulator a is added.
// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlal_lane_s16(a, b, v, 3);
}
508
// Same as the s16 case but for i32 -> i64 widening, lane 1 (top lane of a
// 64-bit vector).
// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[ADD]]
int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlal_lane_s32(a, b, v, 1);
}
521
// laneq variant: the lane source is a 128-bit vector, so lane 7 is the
// highest valid index for 8 x i16.
// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlal_laneq_s16(a, b, v, 7);
}
534
// laneq variant for i32 -> i64: lane 3 is the highest valid index for 4 x i32.
// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[ADD]]
int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlal_laneq_s32(a, b, v, 3);
}
547
// _high variant: the upper half of b (elements 4..7) is extracted first,
// then the usual splat-lane smull + accumulate.
// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlal_high_lane_s16(a, b, v, 3);
}
561
// _high variant for i32 -> i64: upper half of b (elements 2..3), lane 1 of v.
// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[ADD]]
int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlal_high_lane_s32(a, b, v, 1);
}
575
// _high + laneq: upper half of b, lane 7 of a 128-bit lane source.
// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlal_high_laneq_s16(a, b, v, 7);
}
589
// _high + laneq for i32 -> i64: upper half of b, lane 3 of a 4 x i32 source.
// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[ADD]]
int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlal_high_laneq_s32(a, b, v, 3);
}
603
// Widening multiply-subtract: identical to vmlal_lane except the final
// operation is sub (a - widened product).
// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlsl_lane_s16(a, b, v, 3);
}
616
// vmlsl_lane for i32 -> i64, lane 1.
// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[SUB]]
int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlsl_lane_s32(a, b, v, 1);
}
629
// vmlsl laneq: 128-bit lane source, highest lane (7).
// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlsl_laneq_s16(a, b, v, 7);
}
642
// vmlsl laneq for i32 -> i64, lane 3.
// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[SUB]]
int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlsl_laneq_s32(a, b, v, 3);
}
655
// vmlsl _high variant: upper half of b, then splat-lane smull and subtract.
// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlsl_high_lane_s16(a, b, v, 3);
}
669
// vmlsl _high for i32 -> i64: upper half of b, lane 1 of v.
// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[SUB]]
int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlsl_high_lane_s32(a, b, v, 1);
}
683
// vmlsl _high + laneq: upper half of b, lane 7 of a 128-bit source.
// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlsl_high_laneq_s16(a, b, v, 7);
}
697
// vmlsl _high + laneq for i32 -> i64, lane 3.
// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[SUB]]
int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlsl_high_laneq_s32(a, b, v, 3);
}
711
// Unsigned widening multiply-accumulate: same pattern as the s16 case but
// lowered through umull.
// NOTE(review): the C signature uses signed vector types with the unsigned
// intrinsic (bit-pattern identical); matches the rest of this test file.
// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlal_lane_u16(a, b, v, 3);
}
724
// Unsigned widening multiply-accumulate, i32 -> i64, lane 1 (umull).
// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[ADD]]
int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlal_lane_u32(a, b, v, 1);
}
737
// Unsigned laneq variant: lane 7 of a 128-bit source, umull lowering.
// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlal_laneq_u16(a, b, v, 7);
}
750
// Unsigned laneq variant, i32 -> i64, lane 3.
// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[ADD]]
int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlal_laneq_u32(a, b, v, 3);
}
763
// Unsigned _high variant: upper half of b, then splat-lane umull + add.
// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlal_high_lane_u16(a, b, v, 3);
}
777
// Unsigned _high variant, i32 -> i64: upper half of b, lane 1 of v.
// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[ADD]]
int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlal_high_lane_u32(a, b, v, 1);
}
791
// Unsigned _high + laneq: upper half of b, lane 7 of a 128-bit source.
// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlal_high_laneq_u16(a, b, v, 7);
}
805
// Unsigned _high + laneq for i32 -> i64, lane 3.
// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[ADD]]
int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlal_high_laneq_u32(a, b, v, 3);
}
819
// Unsigned widening multiply-subtract: umull product, then sub from a.
// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlsl_lane_u16(a, b, v, 3);
}
832
// Unsigned widening multiply-subtract, i32 -> i64, lane 1.
// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[SUB]]
int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlsl_lane_u32(a, b, v, 1);
}
845
// Unsigned vmlsl laneq: 128-bit lane source, highest lane (7).
// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlsl_laneq_u16(a, b, v, 7);
}
858
// Unsigned vmlsl laneq for i32 -> i64, lane 3.
// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[SUB]]
int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlsl_laneq_u32(a, b, v, 3);
}
871
// Unsigned vmlsl _high: upper half of b, splat-lane umull, subtract from a.
// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlsl_high_lane_u16(a, b, v, 3);
}
885
// Unsigned vmlsl _high for i32 -> i64: upper half of b, lane 1 of v.
// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[SUB]]
int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlsl_high_lane_u32(a, b, v, 1);
}
899
// Unsigned vmlsl _high + laneq: upper half of b, lane 7 of 128-bit source.
// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlsl_high_laneq_u16(a, b, v, 7);
}
913
// Unsigned vmlsl _high + laneq for i32 -> i64, lane 3.
// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[SUB]]
int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlsl_high_laneq_u32(a, b, v, 3);
}
927
// Plain widening multiply from lane (no accumulate): smull of a with the
// splatted lane 3 of v.
// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: ret <4 x i32> [[VMULL2_I]]
int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
  return vmull_lane_s16(a, v, 3);
}
939
// Widening multiply from lane, i32 -> i64, lane 1.
// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: ret <2 x i64> [[VMULL2_I]]
int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
  return vmull_lane_s32(a, v, 1);
}
951
// Unsigned widening multiply from lane (umull); note this one uses proper
// uint vector types, unlike the vmlal/vmlsl unsigned tests above.
// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: ret <4 x i32> [[VMULL2_I]]
uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
  return vmull_lane_u16(a, v, 3);
}
963
// Unsigned widening multiply by lane (32->64 bit): lane 1 splat into umull.v2i64.
// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: ret <2 x i64> [[VMULL2_I]]
uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
  return vmull_lane_u32(a, v, 1);
}
975
// "high" variant: the upper half of %a is first extracted (lanes 4-7), then
// multiplied against the splatted lane via smull.
// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: ret <4 x i32> [[VMULL2_I]]
int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
  return vmull_high_lane_s16(a, v, 3);
}
988
// "high" variant (32->64 bit): upper half of %a (lanes 2-3) times splatted lane 1.
// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: ret <2 x i64> [[VMULL2_I]]
int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
  return vmull_high_lane_s32(a, v, 1);
}
1001
// Unsigned "high" variant: upper half of %a times splatted lane, via umull.
// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: ret <4 x i32> [[VMULL2_I]]
uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
  return vmull_high_lane_u16(a, v, 3);
}
1014
// Unsigned "high" variant (32->64 bit): upper half of %a times splatted lane 1.
// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: ret <2 x i64> [[VMULL2_I]]
uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
  return vmull_high_lane_u32(a, v, 1);
}
1027
// laneq variant: the lane comes from a 128-bit vector; lane 7 (the maximal
// index) exercises a lane the 64-bit form could not address.
// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: ret <4 x i32> [[VMULL2_I]]
int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
  return vmull_laneq_s16(a, v, 7);
}
1039
// laneq variant (32->64 bit): maximal lane 3 of the 128-bit source vector.
// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: ret <2 x i64> [[VMULL2_I]]
int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
  return vmull_laneq_s32(a, v, 3);
}
1051
// Unsigned laneq variant: maximal lane 7 of the 128-bit source, via umull.
// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: ret <4 x i32> [[VMULL2_I]]
uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
  return vmull_laneq_u16(a, v, 7);
}
1063
// Unsigned laneq variant (32->64 bit): maximal lane 3, via umull.
// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: ret <2 x i64> [[VMULL2_I]]
uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
  return vmull_laneq_u32(a, v, 3);
}
1075
// high+laneq: upper half of %a (lanes 4-7) multiplied by maximal lane 7 of %v.
// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: ret <4 x i32> [[VMULL2_I]]
int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
  return vmull_high_laneq_s16(a, v, 7);
}
1088
// high+laneq (32->64 bit): upper half of %a times maximal lane 3 of %v.
// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: ret <2 x i64> [[VMULL2_I]]
int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
  return vmull_high_laneq_s32(a, v, 3);
}
1101
// Unsigned high+laneq: upper half of %a times maximal lane 7, via umull.
// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: ret <4 x i32> [[VMULL2_I]]
uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
  return vmull_high_laneq_u16(a, v, 7);
}
1114
// Unsigned high+laneq (32->64 bit): upper half of %a times maximal lane 3.
// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: ret <2 x i64> [[VMULL2_I]]
uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
  return vmull_high_laneq_u32(a, v, 3);
}
1127
// Saturating doubling multiply-accumulate long by lane: sqdmull of %b and the
// splatted lane, saturating-added (sqadd) into the accumulator %a.
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vqdmlal_lane_s16(a, b, v, 3);
}
1142
// Saturating doubling multiply-accumulate long by lane (32->64 bit):
// sqdmull.v2i64 then sqadd.v2i64 into the accumulator.
// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vqdmlal_lane_s32(a, b, v, 1);
}
1157
// "high" accumulate variant: the upper half of %b (lanes 4-7) is extracted
// before the sqdmull/sqadd sequence.
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vqdmlal_high_lane_s16(a, b, v, 3);
}
1173
// "high" accumulate variant (32->64 bit): upper half of %b, then sqdmull/sqadd.
// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vqdmlal_high_lane_s32(a, b, v, 1);
}
1189
// Saturating doubling multiply-subtract long by lane: same sqdmull product as
// vqdmlal, but saturating-subtracted (sqsub) from the accumulator %a.
// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vqdmlsl_lane_s16(a, b, v, 3);
}
1204
// Saturating doubling multiply-subtract long by lane (32->64 bit):
// sqdmull.v2i64 then sqsub.v2i64 from the accumulator.
// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vqdmlsl_lane_s32(a, b, v, 1);
}
1219
// "high" subtract variant: upper half of %b (lanes 4-7), then sqdmull/sqsub.
// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vqdmlsl_high_lane_s16(a, b, v, 3);
}
1235
// "high" subtract variant (32->64 bit): upper half of %b, then sqdmull/sqsub.
// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vqdmlsl_high_lane_s32(a, b, v, 1);
}
1251
// Saturating doubling multiply-long by lane: lane splat feeding sqdmull; the
// trailing bitcast round-trip mirrors the builtin's generic lowering.
// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
  return vqdmull_lane_s16(a, v, 3);
}
1265
// Saturating doubling multiply-long by lane (32->64 bit): sqdmull.v2i64.
// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
// CHECK: ret <2 x i64> [[TMP2]]
int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
  return vqdmull_lane_s32(a, v, 1);
}
1279
1280 // CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
1281 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1282 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1283 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1284 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1285 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1286 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
1287 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1288 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
1289 // CHECK: ret <4 x i32> [[TMP2]]
test_vqdmull_laneq_s16(int16x4_t a,int16x8_t v)1290 int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
1291 return vqdmull_laneq_s16(a, v, 3);
1292 }
1293
// laneq variant (32->64 bit): lane 3 is the maximal index for a 4 x i32 vector.
// CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
// CHECK: ret <2 x i64> [[TMP2]]
int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
  return vqdmull_laneq_s32(a, v, 3);
}
1307
// "high" variant: upper half of %a (lanes 4-7) times the splatted lane, via sqdmull.
// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
  return vqdmull_high_lane_s16(a, v, 3);
}
1322
// "high" variant (32->64 bit): upper half of %a times splatted lane 1, via sqdmull.
// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
// CHECK: ret <2 x i64> [[TMP2]]
int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
  return vqdmull_high_lane_s32(a, v, 1);
}
1337
// high+laneq: upper half of %a times maximal lane 7 of the 128-bit %v.
// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
  return vqdmull_high_laneq_s16(a, v, 7);
}
1352
// high+laneq (32->64 bit): upper half of %a times maximal lane 3 of %v.
// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
// CHECK: ret <2 x i64> [[TMP2]]
int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
  return vqdmull_high_laneq_s32(a, v, 3);
}
1367
// Saturating doubling multiply returning high half, by lane: non-widening, so
// the result type matches the inputs (sqdmulh.v4i16).
// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
  return vqdmulh_lane_s16(a, v, 3);
}
1381
// q-register form: the 64-bit lane source is splatted out to 8 lanes before
// the 128-bit sqdmulh.v8i16.
// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
  return vqdmulhq_lane_s16(a, v, 3);
}
1395
1396 // CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
1397 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1398 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1399 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1400 // CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1401 // CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1402 // CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
1403 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
1404 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
1405 // CHECK: ret <2 x i32> [[TMP2]]
test_vqdmulh_lane_s32(int32x2_t a,int32x2_t v)1406 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
1407 return vqdmulh_lane_s32(a, v, 1);
1408 }
1409
1410 // CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
1411 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1412 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1413 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
1414 // CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1415 // CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
1416 // CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
1417 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
1418 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
1419 // CHECK: ret <4 x i32> [[TMP2]]
test_vqdmulhq_lane_s32(int32x4_t a,int32x2_t v)1420 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
1421 return vqdmulhq_lane_s32(a, v, 1);
1422 }
1423
// vqrdmulh (signed saturating rounding doubling multiply-high) by-lane tests.
// Same shape as the vqdmulh tests above, but the IR must call the rounding
// intrinsic llvm.aarch64.neon.sqrdmulh.* instead.
// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
  return vqrdmulh_lane_s16(a, v, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
  return vqrdmulhq_lane_s16(a, v, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
  return vqrdmulh_lane_s32(a, v, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
  return vqrdmulhq_lane_s32(a, v, 1);
}
1479
// Floating-point vmul by-lane tests (f32/f64, lane and laneq forms).
// Vector cases lower to a splat shuffle plus a plain fmul; the <1 x double>
// cases instead extract the scalar lane and do a scalar fmul, so their CHECK
// sequences contain extractelement rather than a shufflevector.
// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
// CHECK: ret <2 x float> [[MUL]]
float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
  return vmul_lane_f32(a, v, 1);
}


// CHECK-LABEL: define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %v to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
// CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
// CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
// CHECK: ret <1 x double> [[TMP5]]
// Single-element vector: lane 0 is the only valid index.
float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
  return vmul_lane_f64(a, v, 0);
}


// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
// CHECK: ret <4 x float> [[MUL]]
float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
  return vmulq_lane_f32(a, v, 1);
}

// CHECK-LABEL: define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
// CHECK: ret <2 x double> [[MUL]]
float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
  return vmulq_lane_f64(a, v, 0);
}

// CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
// CHECK: ret <2 x float> [[MUL]]
float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
  return vmul_laneq_f32(a, v, 3);
}

// CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
// CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
// CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
// CHECK: ret <1 x double> [[TMP5]]
float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
  return vmul_laneq_f64(a, v, 1);
}


// CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
// CHECK: ret <4 x float> [[MUL]]
float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
  return vmulq_laneq_f32(a, v, 3);
}

// CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
// CHECK: ret <2 x double> [[MUL]]
float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
  return vmulq_laneq_f64(a, v, 1);
}
1556
// vmulx (floating-point multiply-extended) by-lane tests. Unlike plain vmul,
// vmulx has no plain-IR lowering: the CHECK lines require a splat shuffle
// followed by a call to the llvm.aarch64.neon.fmulx.* intrinsic.
// CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
// CHECK: ret <2 x float> [[VMULX2_I]]
float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
  return vmulx_lane_f32(a, v, 1);
}

// CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
// CHECK: ret <4 x float> [[VMULX2_I]]
float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
  return vmulxq_lane_f32(a, v, 1);
}

// CHECK-LABEL: define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
// CHECK: ret <2 x double> [[VMULX2_I]]
float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
  return vmulxq_lane_f64(a, v, 0);
}

// CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
// CHECK: ret <2 x float> [[VMULX2_I]]
float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
  return vmulx_laneq_f32(a, v, 3);
}

// CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
// CHECK: ret <4 x float> [[VMULX2_I]]
float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
  return vmulxq_laneq_f32(a, v, 3);
}

// CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
// CHECK: ret <2 x double> [[VMULX2_I]]
float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
  return vmulxq_laneq_f64(a, v, 1);
}
1628
// vmla (multiply-accumulate) by-lane tests with lane index 0. These mirror
// the lane-3/lane-1 tests earlier in the file but pin the lane-0 splat, which
// clang emits as a `zeroinitializer` shuffle mask rather than a constant
// vector of indices. IR shape: splat shuffle, mul, add.
// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
// CHECK: ret <4 x i16> [[ADD]]
int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
  return vmla_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
// CHECK: ret <8 x i16> [[ADD]]
int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
  return vmlaq_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
// CHECK: ret <2 x i32> [[ADD]]
int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
  return vmla_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
  return vmlaq_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
// CHECK: ret <4 x i16> [[ADD]]
int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
  return vmla_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
// CHECK: ret <8 x i16> [[ADD]]
int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
  return vmlaq_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
// CHECK: ret <2 x i32> [[ADD]]
int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
  return vmla_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
  return vmlaq_laneq_s32(a, b, v, 0);
}
1700
// vmls (multiply-subtract) by-lane tests with lane index 0. Identical shape
// to the vmla lane-0 tests above, except the accumulate step must be a `sub`.
// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
// CHECK: ret <4 x i16> [[SUB]]
int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
  return vmls_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
// CHECK: ret <8 x i16> [[SUB]]
int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
  return vmlsq_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
// CHECK: ret <2 x i32> [[SUB]]
int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
  return vmls_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
  return vmlsq_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
// CHECK: ret <4 x i16> [[SUB]]
int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
  return vmls_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
// CHECK: ret <8 x i16> [[SUB]]
int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
  return vmlsq_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
// CHECK: ret <2 x i32> [[SUB]]
int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
  return vmls_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
  return vmlsq_laneq_s32(a, b, v, 0);
}
1772
// vmul by-lane tests with lane index 0, signed and unsigned integer element
// types. Signed and unsigned variants intentionally produce identical IR
// (NEON integer vectors are type-erased to <N x iM> at the IR level), so the
// CHECK patterns are the same for _s and _u pairs.
// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
// CHECK: ret <4 x i16> [[MUL]]
int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
  return vmul_lane_s16(a, v, 0);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
// CHECK: ret <8 x i16> [[MUL]]
int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
  return vmulq_lane_s16(a, v, 0);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
// CHECK: ret <2 x i32> [[MUL]]
int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
  return vmul_lane_s32(a, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
// CHECK: ret <4 x i32> [[MUL]]
int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
  return vmulq_lane_s32(a, v, 0);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
// CHECK: ret <4 x i16> [[MUL]]
uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
  return vmul_lane_u16(a, v, 0);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
// CHECK: ret <8 x i16> [[MUL]]
uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
  return vmulq_lane_u16(a, v, 0);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
// CHECK: ret <2 x i32> [[MUL]]
uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
  return vmul_lane_u32(a, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
// CHECK: ret <4 x i32> [[MUL]]
uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
  return vmulq_lane_u32(a, v, 0);
}
1836
// vmul by-laneq (128-bit lane source) tests with lane index 0, signed and
// unsigned. As above, the _s and _u pairs must emit identical IR: a
// zeroinitializer splat shuffle from the quad vector, then a plain mul.
// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
// CHECK: ret <4 x i16> [[MUL]]
int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vmul_laneq_s16(a, v, 0);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
// CHECK: ret <8 x i16> [[MUL]]
int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vmulq_laneq_s16(a, v, 0);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
// CHECK: ret <2 x i32> [[MUL]]
int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vmul_laneq_s32(a, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
// CHECK: ret <4 x i32> [[MUL]]
int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vmulq_laneq_s32(a, v, 0);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
// CHECK: ret <4 x i16> [[MUL]]
uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
  return vmul_laneq_u16(a, v, 0);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
// CHECK: ret <8 x i16> [[MUL]]
uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
  return vmulq_laneq_u16(a, v, 0);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
// CHECK: ret <2 x i32> [[MUL]]
uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
  return vmul_laneq_u32(a, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
// CHECK: ret <4 x i32> [[MUL]]
uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
  return vmulq_laneq_u32(a, v, 0);
}
1900
// vfma/vfms by-lane family, lane 0: fused multiply-add/subtract against one
// splatted lane. Codegen round-trips operands through <N x i8> bitcasts (the
// generic NEON argument-passing pattern), splats the lane, and calls the
// target-independent @llvm.fma intrinsic. vfms is lowered as fma with the
// multiplicand negated up front (fsub from -0.0), not a separate intrinsic.
// CHECK-LABEL: define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK: ret <2 x float> [[FMLA2]]
float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
  return vfma_lane_f32(a, b, v, 0);
}

// 128-bit accumulator, 64-bit lane source.
// CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK: ret <4 x float> [[FMLA2]]
float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
  return vfmaq_lane_f32(a, b, v, 0);
}

// laneq variant: lane taken from a 128-bit source; note the fma operand order
// ([[LANE]] first) differs from the lane variant above.
// CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
// CHECK: ret <2 x float> [[TMP6]]
float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
  return vfma_laneq_f32(a, b, v, 0);
}

// 128-bit laneq variant.
// CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
// CHECK: ret <4 x float> [[TMP6]]
float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
  return vfmaq_laneq_f32(a, b, v, 0);
}

// vfms: multiplicand %b is negated (fsub from -0.0) before the fma.
// CHECK-LABEL: define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK: ret <2 x float> [[FMLA2]]
float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
  return vfms_lane_f32(a, b, v, 0);
}

// 128-bit vfms lane variant.
// CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK: ret <4 x float> [[FMLA2]]
float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
  return vfmsq_lane_f32(a, b, v, 0);
}

// vfms laneq variant.
// CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
// CHECK: ret <2 x float> [[TMP6]]
float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
  return vfms_laneq_f32(a, b, v, 0);
}

// 128-bit vfms laneq variant.
// CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
// CHECK: ret <4 x float> [[TMP6]]
float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
  return vfmsq_laneq_f32(a, b, v, 0);
}

// f64 fma laneq variant.
// CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
// CHECK: ret <2 x double> [[TMP6]]
float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
  return vfmaq_laneq_f64(a, b, v, 0);
}

// f64 fms laneq variant (negated multiplicand).
// CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
// CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
// CHECK: ret <2 x double> [[TMP6]]
float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
  return vfmsq_laneq_f64(a, b, v, 0);
}
2045
// vmlal (signed multiply-accumulate long) by-lane family, lane 0: splat the
// lane, widen-multiply via @llvm.aarch64.neon.smull, then add into the wide
// accumulator. The _high variants first extract the upper half of %b with a
// shufflevector before the widening multiply.
// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlal_lane_s16(a, b, v, 0);
}

// s32 -> s64 widening variant.
// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[ADD]]
int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlal_lane_s32(a, b, v, 0);
}

// laneq: lane taken from a 128-bit source vector.
// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlal_laneq_s16(a, b, v, 0);
}

// laneq s32 variant.
// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[ADD]]
int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlal_laneq_s32(a, b, v, 0);
}

// _high: multiply the upper half of %b (elements 4..7).
// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlal_high_lane_s16(a, b, v, 0);
}

// _high s32: upper half is elements 2..3.
// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[ADD]]
int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlal_high_lane_s32(a, b, v, 0);
}

// _high + laneq combination, s16.
// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlal_high_laneq_s16(a, b, v, 0);
}

// _high + laneq combination, s32.
// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[ADD]]
int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlal_high_laneq_s32(a, b, v, 0);
}
2153
// vmlsl (signed multiply-subtract long) by-lane family, lane 0: identical
// lowering to vmlal above except the widened product is subtracted from the
// accumulator (sub instead of add).
// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlsl_lane_s16(a, b, v, 0);
}

// s32 -> s64 widening variant.
// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[SUB]]
int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlsl_lane_s32(a, b, v, 0);
}

// laneq: lane taken from a 128-bit source vector.
// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlsl_laneq_s16(a, b, v, 0);
}

// laneq s32 variant.
// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[SUB]]
int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlsl_laneq_s32(a, b, v, 0);
}

// _high: multiply the upper half of %b (elements 4..7).
// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlsl_high_lane_s16(a, b, v, 0);
}

// _high s32: upper half is elements 2..3.
// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[SUB]]
int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlsl_high_lane_s32(a, b, v, 0);
}

// _high + laneq combination, s16.
// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlsl_high_laneq_s16(a, b, v, 0);
}

// _high + laneq combination, s32.
// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
// CHECK: ret <2 x i64> [[SUB]]
int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlsl_high_laneq_s32(a, b, v, 0);
}
2261
2262 // CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
2263 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2264 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2265 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2266 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2267 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2268 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2269 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2270 // CHECK: ret <4 x i32> [[ADD]]
test_vmlal_lane_u16_0(int32x4_t a,int16x4_t b,int16x4_t v)2271 int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2272 return vmlal_lane_u16(a, b, v, 0);
2273 }
2274
2275 // CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
2276 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2277 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2278 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2279 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2280 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2281 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2282 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2283 // CHECK: ret <2 x i64> [[ADD]]
test_vmlal_lane_u32_0(int64x2_t a,int32x2_t b,int32x2_t v)2284 int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2285 return vmlal_lane_u32(a, b, v, 0);
2286 }
2287
2288 // CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
2289 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2290 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2291 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2292 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2293 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2294 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2295 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2296 // CHECK: ret <4 x i32> [[ADD]]
test_vmlal_laneq_u16_0(int32x4_t a,int16x4_t b,int16x8_t v)2297 int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2298 return vmlal_laneq_u16(a, b, v, 0);
2299 }
2300
2301 // CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
2302 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2303 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2304 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2305 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2306 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2307 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2308 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2309 // CHECK: ret <2 x i64> [[ADD]]
test_vmlal_laneq_u32_0(int64x2_t a,int32x2_t b,int32x4_t v)2310 int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2311 return vmlal_laneq_u32(a, b, v, 0);
2312 }
2313
2314 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
2315 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2316 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2317 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2318 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2319 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2320 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2321 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2322 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2323 // CHECK: ret <4 x i32> [[ADD]]
test_vmlal_high_lane_u16_0(int32x4_t a,int16x8_t b,int16x4_t v)2324 int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2325 return vmlal_high_lane_u16(a, b, v, 0);
2326 }
2327
2328 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
2329 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2330 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2331 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2332 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2333 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2334 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2335 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2336 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2337 // CHECK: ret <2 x i64> [[ADD]]
// Exercise vmlal_high_lane_u32 with lane 0.
// Fix: unsigned intrinsic, unsigned vector types (IR/CHECK lines unchanged —
// LLVM vectors are signless).
uint64x2_t test_vmlal_high_lane_u32_0(uint64x2_t a, uint32x4_t b, uint32x2_t v) {
  return vmlal_high_lane_u32(a, b, v, 0);
}
2341
2342 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
2343 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2344 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2345 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2346 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2347 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2348 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2349 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2350 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2351 // CHECK: ret <4 x i32> [[ADD]]
// Exercise vmlal_high_laneq_u16 with lane 0 of the 128-bit operand.
// Fix: unsigned intrinsic, unsigned vector types (IR/CHECK lines unchanged).
uint32x4_t test_vmlal_high_laneq_u16_0(uint32x4_t a, uint16x8_t b, uint16x8_t v) {
  return vmlal_high_laneq_u16(a, b, v, 0);
}
2355
2356 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
2357 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2358 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2359 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2360 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2361 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2362 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2363 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2364 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2365 // CHECK: ret <2 x i64> [[ADD]]
// Exercise vmlal_high_laneq_u32 with lane 0 of the 128-bit operand.
// Fix: unsigned intrinsic, unsigned vector types (IR/CHECK lines unchanged).
uint64x2_t test_vmlal_high_laneq_u32_0(uint64x2_t a, uint32x4_t b, uint32x4_t v) {
  return vmlal_high_laneq_u32(a, b, v, 0);
}
2369
2370 // CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
2371 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2372 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2373 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2374 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2375 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2376 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2377 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2378 // CHECK: ret <4 x i32> [[SUB]]
// Exercise vmlsl_lane_u16 with lane 0 (umull then sub, per CHECK lines above).
// Fix: unsigned intrinsic, unsigned vector types (IR/CHECK lines unchanged —
// LLVM vectors are signless).
uint32x4_t test_vmlsl_lane_u16_0(uint32x4_t a, uint16x4_t b, uint16x4_t v) {
  return vmlsl_lane_u16(a, b, v, 0);
}
2382
2383 // CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
2384 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2385 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2386 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2387 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2388 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2389 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2390 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2391 // CHECK: ret <2 x i64> [[SUB]]
// Exercise vmlsl_lane_u32 with lane 0.
// Fix: unsigned intrinsic, unsigned vector types (IR/CHECK lines unchanged).
uint64x2_t test_vmlsl_lane_u32_0(uint64x2_t a, uint32x2_t b, uint32x2_t v) {
  return vmlsl_lane_u32(a, b, v, 0);
}
2395
2396 // CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
2397 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2398 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2399 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2400 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2401 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2402 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2403 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2404 // CHECK: ret <4 x i32> [[SUB]]
// Exercise vmlsl_laneq_u16 with lane 0 of the 128-bit operand.
// Fix: unsigned intrinsic, unsigned vector types (IR/CHECK lines unchanged).
uint32x4_t test_vmlsl_laneq_u16_0(uint32x4_t a, uint16x4_t b, uint16x8_t v) {
  return vmlsl_laneq_u16(a, b, v, 0);
}
2408
2409 // CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
2410 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2411 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2412 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2413 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2414 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2415 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2416 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2417 // CHECK: ret <2 x i64> [[SUB]]
// Exercise vmlsl_laneq_u32 with lane 0 of the 128-bit operand.
// Fix: unsigned intrinsic, unsigned vector types (IR/CHECK lines unchanged).
uint64x2_t test_vmlsl_laneq_u32_0(uint64x2_t a, uint32x2_t b, uint32x4_t v) {
  return vmlsl_laneq_u32(a, b, v, 0);
}
2421
2422 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
2423 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2424 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2425 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2426 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2427 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2428 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2429 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2430 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2431 // CHECK: ret <4 x i32> [[SUB]]
// Exercise vmlsl_high_lane_u16 with lane 0 (high half of b).
// Fix: unsigned intrinsic, unsigned vector types (IR/CHECK lines unchanged).
uint32x4_t test_vmlsl_high_lane_u16_0(uint32x4_t a, uint16x8_t b, uint16x4_t v) {
  return vmlsl_high_lane_u16(a, b, v, 0);
}
2435
2436 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
2437 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2438 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2439 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2440 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2441 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2442 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2443 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2444 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2445 // CHECK: ret <2 x i64> [[SUB]]
// Exercise vmlsl_high_lane_u32 with lane 0 (high half of b).
// Fix: unsigned intrinsic, unsigned vector types (IR/CHECK lines unchanged).
uint64x2_t test_vmlsl_high_lane_u32_0(uint64x2_t a, uint32x4_t b, uint32x2_t v) {
  return vmlsl_high_lane_u32(a, b, v, 0);
}
2449
2450 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
2451 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2452 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2453 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2454 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2455 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2456 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2457 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2458 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2459 // CHECK: ret <4 x i32> [[SUB]]
// Exercise vmlsl_high_laneq_u16 with lane 0 of the 128-bit operand.
// Fix: unsigned intrinsic, unsigned vector types (IR/CHECK lines unchanged).
uint32x4_t test_vmlsl_high_laneq_u16_0(uint32x4_t a, uint16x8_t b, uint16x8_t v) {
  return vmlsl_high_laneq_u16(a, b, v, 0);
}
2463
2464 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
2465 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2466 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2467 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2468 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2469 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2470 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2471 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2472 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2473 // CHECK: ret <2 x i64> [[SUB]]
// Exercise vmlsl_high_laneq_u32 with lane 0 of the 128-bit operand.
// Fix: unsigned intrinsic, unsigned vector types (IR/CHECK lines unchanged).
uint64x2_t test_vmlsl_high_laneq_u32_0(uint64x2_t a, uint32x4_t b, uint32x4_t v) {
  return vmlsl_high_laneq_u32(a, b, v, 0);
}
2477
2478 // CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
2479 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2480 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2481 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2482 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2483 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2484 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2485 // CHECK: ret <4 x i32> [[VMULL2_I]]
// vmull_lane_s16, lane 0: v[0] is splatted (zeroinitializer shuffle in the
// CHECK lines above) and widened-multiplied with a via aarch64.neon.smull.
int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
  return vmull_lane_s16(a, v, 0);
}
2489
2490 // CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
2491 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2492 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2493 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2494 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2495 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2496 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2497 // CHECK: ret <2 x i64> [[VMULL2_I]]
// vmull_lane_s32, lane 0: splat of v[0] multiplied long with a (smull.v2i64).
int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
  return vmull_lane_s32(a, v, 0);
}
2501
2502 // CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 {
2503 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2504 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2505 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2506 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2507 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2508 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2509 // CHECK: ret <4 x i32> [[VMULL2_I]]
// vmull_lane_u16, lane 0: unsigned widening multiply (umull.v4i32) against
// the splat of v[0].
uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
  return vmull_lane_u16(a, v, 0);
}
2513
2514 // CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 {
2515 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2516 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2517 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2518 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2519 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2520 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2521 // CHECK: ret <2 x i64> [[VMULL2_I]]
// vmull_lane_u32, lane 0: unsigned widening multiply (umull.v2i64) against
// the splat of v[0].
uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
  return vmull_lane_u32(a, v, 0);
}
2525
2526 // CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
2527 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2528 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2529 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2530 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2531 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2532 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2533 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2534 // CHECK: ret <4 x i32> [[VMULL2_I]]
// vmull_high_lane_s16, lane 0: the high half of a (elements 4..7, per the
// first shuffle in the CHECK lines) is smull'ed with the splat of v[0].
int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
  return vmull_high_lane_s16(a, v, 0);
}
2538
2539 // CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
2540 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2541 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2542 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2543 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2544 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2545 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2546 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2547 // CHECK: ret <2 x i64> [[VMULL2_I]]
// vmull_high_lane_s32, lane 0: high half of a (elements 2..3) smull'ed with
// the splat of v[0].
int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
  return vmull_high_lane_s32(a, v, 0);
}
2551
2552 // CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 {
2553 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2554 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2555 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2556 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2557 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2558 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2559 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2560 // CHECK: ret <4 x i32> [[VMULL2_I]]
// vmull_high_lane_u16, lane 0: high half of a umull'ed with the splat of v[0].
uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
  return vmull_high_lane_u16(a, v, 0);
}
2564
2565 // CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 {
2566 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2567 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2568 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2569 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2570 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2571 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2572 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2573 // CHECK: ret <2 x i64> [[VMULL2_I]]
// vmull_high_lane_u32, lane 0: high half of a umull'ed with the splat of v[0].
uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
  return vmull_high_lane_u32(a, v, 0);
}
2577
2578 // CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
2579 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2580 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2581 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2582 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2583 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2584 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2585 // CHECK: ret <4 x i32> [[VMULL2_I]]
// vmull_laneq_s16, lane 0 of the 128-bit operand: v[0] splatted to 4 lanes
// and smull'ed with a.
int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vmull_laneq_s16(a, v, 0);
}
2589
2590 // CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
2591 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2592 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2593 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2594 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2595 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2596 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2597 // CHECK: ret <2 x i64> [[VMULL2_I]]
// vmull_laneq_s32, lane 0 of the 128-bit operand: v[0] splatted and smull'ed
// with a.
int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vmull_laneq_s32(a, v, 0);
}
2601
2602 // CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 {
2603 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2604 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2605 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2606 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2607 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2608 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2609 // CHECK: ret <4 x i32> [[VMULL2_I]]
// vmull_laneq_u16, lane 0 of the 128-bit operand: v[0] splatted and umull'ed
// with a.
uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
  return vmull_laneq_u16(a, v, 0);
}
2613
2614 // CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 {
2615 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2616 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2617 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2618 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2619 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2620 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2621 // CHECK: ret <2 x i64> [[VMULL2_I]]
// vmull_laneq_u32, lane 0 of the 128-bit operand: v[0] splatted and umull'ed
// with a.
uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
  return vmull_laneq_u32(a, v, 0);
}
2625
2626 // CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
2627 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2628 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2629 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2630 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2631 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2632 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2633 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2634 // CHECK: ret <4 x i32> [[VMULL2_I]]
// vmull_high_laneq_s16, lane 0: high half of a smull'ed with the splat of
// v[0] (both shuffles visible in the CHECK lines above).
int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vmull_high_laneq_s16(a, v, 0);
}
2638
2639 // CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
2640 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2641 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2642 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2643 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2644 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2645 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2646 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2647 // CHECK: ret <2 x i64> [[VMULL2_I]]
// vmull_high_laneq_s32, lane 0: high half of a smull'ed with the splat of v[0].
int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vmull_high_laneq_s32(a, v, 0);
}
2651
2652 // CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 {
2653 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2654 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2655 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2656 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2657 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2658 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2659 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2660 // CHECK: ret <4 x i32> [[VMULL2_I]]
// vmull_high_laneq_u16, lane 0: high half of a umull'ed with the splat of v[0].
uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
  return vmull_high_laneq_u16(a, v, 0);
}
2664
2665 // CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 {
2666 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2667 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2668 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2669 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2670 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2671 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2672 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2673 // CHECK: ret <2 x i64> [[VMULL2_I]]
// vmull_high_laneq_u32, lane 0: high half of a umull'ed with the splat of v[0].
uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
  return vmull_high_laneq_u32(a, v, 0);
}
2677
2678 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
2679 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2680 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2681 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2682 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2683 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2684 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
2685 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
2686 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2687 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
2688 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
// vqdmlal_lane_s16, lane 0: saturating doubling multiply-long of b with the
// splat of v[0] (sqdmull), saturating-accumulated into a (sqadd).
int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vqdmlal_lane_s16(a, b, v, 0);
}
2692
2693 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
2694 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2695 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2696 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2697 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2698 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2699 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
2700 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
2701 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
2702 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
2703 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
// vqdmlal_lane_s32, lane 0: sqdmull of b with the splat of v[0], then sqadd
// into a.
int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vqdmlal_lane_s32(a, b, v, 0);
}
2707
2708 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
2709 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2710 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2711 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2712 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2713 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2714 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2715 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
2716 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
2717 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2718 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
2719 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
// vqdmlal_high_lane_s16, lane 0: high half of b (elements 4..7) sqdmull'ed
// with the splat of v[0], then sqadd into a.
int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vqdmlal_high_lane_s16(a, b, v, 0);
}
2723
2724 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
2725 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2726 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2727 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2728 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2729 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2730 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2731 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
2732 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
2733 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
2734 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
2735 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
// vqdmlal_high_lane_s32, lane 0: high half of b (elements 2..3) sqdmull'ed
// with the splat of v[0], then sqadd into a.
int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vqdmlal_high_lane_s32(a, b, v, 0);
}
2739
2740 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
2741 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2742 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2743 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2744 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2745 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2746 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
2747 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
2748 // CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2749 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
2750 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
// vqdmlsl_lane_s16, lane 0: sqdmull of b with the splat of v[0], then
// saturating subtract from a (sqsub).
int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vqdmlsl_lane_s16(a, b, v, 0);
}
2754
2755 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
2756 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2757 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2758 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2759 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2760 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2761 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
2762 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
2763 // CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
2764 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
2765 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
// vqdmlsl_lane_s32, lane 0: sqdmull of b with the splat of v[0], then sqsub
// from a.
int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vqdmlsl_lane_s32(a, b, v, 0);
}
2769
2770 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
2771 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2772 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2773 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2774 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2775 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2776 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2777 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
2778 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
2779 // CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2780 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
2781 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
test_vqdmlsl_high_lane_s16_0(int32x4_t a,int16x8_t b,int16x4_t v)2782 int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2783 return vqdmlsl_high_lane_s16(a, b, v, 0);
2784 }
2785
2786 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
2787 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2788 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2789 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2790 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2791 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2792 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2793 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
2794 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
2795 // CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
2796 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
2797 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
test_vqdmlsl_high_lane_s32_0(int64x2_t a,int32x4_t b,int32x2_t v)2798 int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2799 return vqdmlsl_high_lane_s32(a, b, v, 0);
2800 }
2801
// NOTE(review): lane-index-0 tests for saturating doubling multiply long
// (vqdmull_lane / vqdmull_laneq / vqdmull_high_lane / vqdmull_high_laneq).
// Each test pins: lane splat via shufflevector (zeroinitializer mask for
// lane 0), bitcast round-trips inserted by the builtin lowering, and the
// @llvm.aarch64.neon.sqdmull call. CHECK comments are order-sensitive for
// FileCheck; leave them byte-identical.
2802 // CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
2803 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2804 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2805 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2806 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2807 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2808 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
2809 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2810 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
2811 // CHECK:   ret <4 x i32> [[TMP2]]
test_vqdmull_lane_s16_0(int16x4_t a,int16x4_t v)2812 int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
2813   return vqdmull_lane_s16(a, v, 0);
2814 }
2815 
2816 // CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
2817 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2818 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2819 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2820 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2821 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2822 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
2823 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2824 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
2825 // CHECK:   ret <2 x i64> [[TMP2]]
test_vqdmull_lane_s32_0(int32x2_t a,int32x2_t v)2826 int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
2827   return vqdmull_lane_s32(a, v, 0);
2828 }
2829 
// laneq variants: the lane source %v is a 128-bit (quad) vector; the splat
// shuffle narrows it to the 64-bit operand width.
2830 // CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
2831 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2832 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2833 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2834 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2835 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2836 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
2837 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2838 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
2839 // CHECK:   ret <4 x i32> [[TMP2]]
test_vqdmull_laneq_s16_0(int16x4_t a,int16x8_t v)2840 int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
2841   return vqdmull_laneq_s16(a, v, 0);
2842 }
2843 
2844 // CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
2845 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2846 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2847 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2848 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2849 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2850 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
2851 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2852 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
2853 // CHECK:   ret <2 x i64> [[TMP2]]
test_vqdmull_laneq_s32_0(int32x2_t a,int32x4_t v)2854 int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
2855   return vqdmull_laneq_s32(a, v, 0);
2856 }
2857 
// high variants: the upper half of %a is extracted first (<i32 4..7> or
// <i32 2,3>) before the widening multiply.
2858 // CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
2859 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2860 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2861 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2862 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2863 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2864 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2865 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
2866 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2867 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
2868 // CHECK:   ret <4 x i32> [[TMP2]]
test_vqdmull_high_lane_s16_0(int16x8_t a,int16x4_t v)2869 int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
2870   return vqdmull_high_lane_s16(a, v, 0);
2871 }
2872 
2873 // CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
2874 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2875 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2876 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2877 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2878 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2879 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2880 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
2881 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2882 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
2883 // CHECK:   ret <2 x i64> [[TMP2]]
test_vqdmull_high_lane_s32_0(int32x4_t a,int32x2_t v)2884 int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
2885   return vqdmull_high_lane_s32(a, v, 0);
2886 }
2887 
2888 // CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
2889 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2890 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2891 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2892 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2893 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2894 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2895 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
2896 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2897 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
2898 // CHECK:   ret <4 x i32> [[TMP2]]
test_vqdmull_high_laneq_s16_0(int16x8_t a,int16x8_t v)2899 int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
2900   return vqdmull_high_laneq_s16(a, v, 0);
2901 }
2902 
2903 // CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
2904 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2905 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2906 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2907 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2908 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2909 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2910 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
2911 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2912 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
2913 // CHECK:   ret <2 x i64> [[TMP2]]
test_vqdmull_high_laneq_s32_0(int32x4_t a,int32x4_t v)2914 int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
2915   return vqdmull_high_laneq_s32(a, v, 0);
2916 }
2917
// NOTE(review): lane-index-0 tests for saturating doubling multiply
// returning high half (vqdmulh_lane / vqdmulhq_lane, s16/s32). Each test
// pins the lane-0 splat and the @llvm.aarch64.neon.sqdmulh call. CHECK
// comments are matched in order by FileCheck -- keep them byte-identical.
2918 // CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
2919 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2920 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2921 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2922 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2923 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2924 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
2925 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
2926 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
2927 // CHECK:   ret <4 x i16> [[TMP2]]
test_vqdmulh_lane_s16_0(int16x4_t a,int16x4_t v)2928 int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
2929   return vqdmulh_lane_s16(a, v, 0);
2930 }
2931 
2932 // CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
2933 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
2934 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
2935 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
2936 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2937 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
2938 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
2939 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
2940 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
2941 // CHECK:   ret <8 x i16> [[TMP2]]
test_vqdmulhq_lane_s16_0(int16x8_t a,int16x4_t v)2942 int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
2943   return vqdmulhq_lane_s16(a, v, 0);
2944 }
2945 
2946 // CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
2947 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2948 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2949 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2950 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2951 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2952 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
2953 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
2954 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
2955 // CHECK:   ret <2 x i32> [[TMP2]]
test_vqdmulh_lane_s32_0(int32x2_t a,int32x2_t v)2956 int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
2957   return vqdmulh_lane_s32(a, v, 0);
2958 }
2959 
2960 // CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
2961 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
2962 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2963 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
2964 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2965 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
2966 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
2967 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
2968 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
2969 // CHECK:   ret <4 x i32> [[TMP2]]
test_vqdmulhq_lane_s32_0(int32x4_t a,int32x2_t v)2970 int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
2971   return vqdmulhq_lane_s32(a, v, 0);
2972 }
2973
// NOTE(review): lane-index-0 tests for saturating rounding doubling multiply
// returning high half (vqrdmulh_lane / vqrdmulhq_lane, s16/s32). Identical
// IR shape to the vqdmulh tests above but lowering to
// @llvm.aarch64.neon.sqrdmulh. CHECK lines are FileCheck directives --
// keep byte-identical.
2974 // CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
2975 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2976 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2977 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2978 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2979 // CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2980 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
2981 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
2982 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
2983 // CHECK:   ret <4 x i16> [[TMP2]]
test_vqrdmulh_lane_s16_0(int16x4_t a,int16x4_t v)2984 int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
2985   return vqrdmulh_lane_s16(a, v, 0);
2986 }
2987 
2988 // CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
2989 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
2990 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
2991 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
2992 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2993 // CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
2994 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
2995 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
2996 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
2997 // CHECK:   ret <8 x i16> [[TMP2]]
test_vqrdmulhq_lane_s16_0(int16x8_t a,int16x4_t v)2998 int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
2999   return vqrdmulhq_lane_s16(a, v, 0);
3000 }
3001 
3002 // CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
3003 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
3004 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3005 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
3006 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3007 // CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3008 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
3009 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
3010 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
3011 // CHECK:   ret <2 x i32> [[TMP2]]
test_vqrdmulh_lane_s32_0(int32x2_t a,int32x2_t v)3012 int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
3013   return vqrdmulh_lane_s32(a, v, 0);
3014 }
3015 
3016 // CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
3017 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
3018 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3019 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
3020 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3021 // CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
3022 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
3023 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
3024 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
3025 // CHECK:   ret <4 x i32> [[TMP2]]
test_vqrdmulhq_lane_s32_0(int32x4_t a,int32x2_t v)3026 int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
3027   return vqrdmulhq_lane_s32(a, v, 0);
3028 }
3029
// NOTE(review): lane-index-0 tests for floating-point multiply by lane
// (vmul_lane / vmulq_lane / vmul_laneq / vmulq_laneq, f32/f64). These lower
// to a plain splat-shuffle plus fmul; the f64x1 laneq case instead extracts
// the scalar and does a scalar fmul. CHECK lines are FileCheck directives --
// keep byte-identical.
3030 // CHECK-LABEL: define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 {
3031 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
3032 // CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
3033 // CHECK:   ret <2 x float> [[MUL]]
test_vmul_lane_f32_0(float32x2_t a,float32x2_t v)3034 float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
3035   return vmul_lane_f32(a, v, 0);
3036 }
3037 
3038 // CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 {
3039 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
3040 // CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
3041 // CHECK:   ret <4 x float> [[MUL]]
test_vmulq_lane_f32_0(float32x4_t a,float32x2_t v)3042 float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
3043   return vmulq_lane_f32(a, v, 0);
3044 }
3045 
3046 // CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 {
3047 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
3048 // CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
3049 // CHECK:   ret <2 x float> [[MUL]]
test_vmul_laneq_f32_0(float32x2_t a,float32x4_t v)3050 float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
3051   return vmul_laneq_f32(a, v, 0);
3052 }
3053 
// f64x1 case: no vector fmul -- lane 0 is extracted and multiplied as a
// scalar double, then bitcast back to <1 x double>.
3054 // CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) #0 {
3055 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
3056 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
3057 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
3058 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
3059 // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
3060 // CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
3061 // CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
3062 // CHECK:   ret <1 x double> [[TMP5]]
test_vmul_laneq_f64_0(float64x1_t a,float64x2_t v)3063 float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
3064   return vmul_laneq_f64(a, v, 0);
3065 }
3066 
3067 // CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 {
3068 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
3069 // CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
3070 // CHECK:   ret <4 x float> [[MUL]]
test_vmulq_laneq_f32_0(float32x4_t a,float32x4_t v)3071 float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
3072   return vmulq_laneq_f32(a, v, 0);
3073 }
3074 
3075 // CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 {
3076 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
3077 // CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
3078 // CHECK:   ret <2 x double> [[MUL]]
test_vmulq_laneq_f64_0(float64x2_t a,float64x2_t v)3079 float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
3080   return vmulq_laneq_f64(a, v, 0);
3081 }
3082
// NOTE(review): lane-index-0 tests for the extended floating-point multiply
// (vmulx_lane / vmulxq_lane / vmulx_laneq / vmulxq_laneq). Unlike vmul,
// these cannot use a plain fmul and must lower to the
// @llvm.aarch64.neon.fmulx intrinsic after the lane splat. CHECK lines are
// FileCheck directives -- keep byte-identical.
3083 // CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 {
3084 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
3085 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3086 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
3087 // CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3088 // CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
3089 // CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
3090 // CHECK:   ret <2 x float> [[VMULX2_I]]
test_vmulx_lane_f32_0(float32x2_t a,float32x2_t v)3091 float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
3092   return vmulx_lane_f32(a, v, 0);
3093 }
3094 
3095 // CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 {
3096 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
3097 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3098 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
3099 // CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3100 // CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
3101 // CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
3102 // CHECK:   ret <4 x float> [[VMULX2_I]]
test_vmulxq_lane_f32_0(float32x4_t a,float32x2_t v)3103 float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
3104   return vmulxq_lane_f32(a, v, 0);
3105 }
3106 
3107 // CHECK-LABEL: define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) #0 {
3108 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
3109 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
3110 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
3111 // CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
3112 // CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
3113 // CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
3114 // CHECK:   ret <2 x double> [[VMULX2_I]]
test_vmulxq_lane_f64_0(float64x2_t a,float64x1_t v)3115 float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
3116   return vmulxq_lane_f64(a, v, 0);
3117 }
3118 
3119 // CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 {
3120 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
3121 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3122 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
3123 // CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3124 // CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
3125 // CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
3126 // CHECK:   ret <2 x float> [[VMULX2_I]]
test_vmulx_laneq_f32_0(float32x2_t a,float32x4_t v)3127 float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
3128   return vmulx_laneq_f32(a, v, 0);
3129 }
3130 
3131 // CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 {
3132 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
3133 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3134 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
3135 // CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3136 // CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
3137 // CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
3138 // CHECK:   ret <4 x float> [[VMULX2_I]]
test_vmulxq_laneq_f32_0(float32x4_t a,float32x4_t v)3139 float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
3140   return vmulxq_laneq_f32(a, v, 0);
3141 }
3142 
3143 // CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 {
3144 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
3145 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
3146 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
3147 // CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
3148 // CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
3149 // CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
3150 // CHECK:   ret <2 x double> [[VMULX2_I]]
test_vmulxq_laneq_f64_0(float64x2_t a,float64x2_t v)3151 float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
3152   return vmulxq_laneq_f64(a, v, 0);
3153 }
3154
// NOTE(review): tests for widening multiply of the high half by a scalar
// (vmull_high_n, signed and unsigned). The scalar %b is broadcast with a
// chain of insertelement instructions, the high half of %a is extracted,
// and the pair feeds @llvm.aarch64.neon.smull / .umull. CHECK lines are
// FileCheck directives -- keep byte-identical.
3155 // CHECK-LABEL: define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
3156 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3157 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3158 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3159 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
3160 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
3161 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
3162 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
3163 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3164 // CHECK:   [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3165 // CHECK:   [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2
3166 // CHECK:   ret <4 x i32> [[VMULL5_I_I]]
test_vmull_high_n_s16(int16x8_t a,int16_t b)3167 int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
3168   return vmull_high_n_s16(a, b);
3169 }
3170 
3171 // CHECK-LABEL: define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
3172 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
3173 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3174 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3175 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
3176 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3177 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3178 // CHECK:   [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3179 // CHECK:   [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2
3180 // CHECK:   ret <2 x i64> [[VMULL3_I_I]]
test_vmull_high_n_s32(int32x4_t a,int32_t b)3181 int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
3182   return vmull_high_n_s32(a, b);
3183 }
3184 
// Unsigned variants: identical shape, but lowering to umull instead of smull.
3185 // CHECK-LABEL: define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) #0 {
3186 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3187 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3188 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3189 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
3190 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
3191 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
3192 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
3193 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3194 // CHECK:   [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3195 // CHECK:   [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2
3196 // CHECK:   ret <4 x i32> [[VMULL5_I_I]]
test_vmull_high_n_u16(uint16x8_t a,uint16_t b)3197 uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
3198   return vmull_high_n_u16(a, b);
3199 }
3200 
3201 // CHECK-LABEL: define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) #0 {
3202 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
3203 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3204 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3205 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
3206 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3207 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3208 // CHECK:   [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3209 // CHECK:   [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2
3210 // CHECK:   ret <2 x i64> [[VMULL3_I_I]]
test_vmull_high_n_u32(uint32x4_t a,uint32_t b)3211 uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
3212   return vmull_high_n_u32(a, b);
3213 }
3214
// NOTE(review): saturating doubling widening multiply of the high half by a
// scalar (vqdmull_high_n_s16): scalar broadcast via insertelement chain,
// high-half extract, then @llvm.aarch64.neon.sqdmull. CHECK lines are
// FileCheck directives -- keep byte-identical.
3215 // CHECK-LABEL: define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
3216 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3217 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3218 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3219 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
3220 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
3221 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
3222 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
3223 // CHECK:   [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3224 // CHECK:   [[VQDMULL_V4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3225 // CHECK:   [[VQDMULL_V5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V4_I_I]]) #2
3226 // CHECK:   [[VQDMULL_V6_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I_I]] to <16 x i8>
3227 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I_I]] to <4 x i32>
3228 // CHECK:   ret <4 x i32> [[TMP2]]
test_vqdmull_high_n_s16(int16x8_t a,int16_t b)3229 int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
3230   return vqdmull_high_n_s16(a, b);
3231 }
3232
// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQDMULL_V2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMULL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V2_I_I]]) #2
// CHECK: [[VQDMULL_V4_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I_I]] to <2 x i64>
// CHECK: ret <2 x i64> [[TMP2]]
// Pins lowering of vqdmull_high_n_s32: sqdmull.v2i64 on the high half of %a
// (lanes 2-3) against the splatted scalar %b.
int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
  return vqdmull_high_n_s32(a, b);
}
3248
// CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
// CHECK: ret <4 x i32> [[ADD_I_I]]
// Pins lowering of vmlal_high_n_s16: smull.v4i32 of high half of %b with
// splatted %c, then plain vector add into the accumulator %a.
int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
  return vmlal_high_n_s16(a, b, c);
}
3265
// CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
// CHECK: ret <2 x i64> [[ADD_I_I]]
// Pins lowering of vmlal_high_n_s32: smull.v2i64 of high half of %b with
// splatted %c, then vector add into %a.
int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
  return vmlal_high_n_s32(a, b, c);
}
3280
// CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
// CHECK: ret <4 x i32> [[ADD_I_I]]
// Unsigned variant of vmlal_high_n: umull.v4i32 of high half of %b with
// splatted %c, then vector add into %a.
uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
  return vmlal_high_n_u16(a, b, c);
}
3297
// CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
// CHECK: ret <2 x i64> [[ADD_I_I]]
// Unsigned variant of vmlal_high_n: umull.v2i64 of high half of %b with
// splatted %c, then vector add into %a.
uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
  return vmlal_high_n_u32(a, b, c);
}
3312
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2
// CHECK: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMLAL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLAL_V6_I_I]]
// Pins lowering of vqdmlal_high_n_s16: sqdmull.v4i32 followed by a saturating
// accumulate (sqadd.v4i32) into %a.
int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
  return vqdmlal_high_n_s16(a, b, c);
}
3331
// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2
// CHECK: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQDMLAL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLAL_V4_I_I]]
// Pins lowering of vqdmlal_high_n_s32: sqdmull.v2i64 followed by a saturating
// accumulate (sqadd.v2i64) into %a.
int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
  return vqdmlal_high_n_s32(a, b, c);
}
3348
// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
// CHECK: ret <4 x i32> [[SUB_I_I]]
// Pins lowering of vmlsl_high_n_s16: smull.v4i32 then vector sub from the
// accumulator %a (multiply-subtract variant of vmlal).
int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
  return vmlsl_high_n_s16(a, b, c);
}
3365
// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
// CHECK: ret <2 x i64> [[SUB_I_I]]
// Pins lowering of vmlsl_high_n_s32: smull.v2i64 then vector sub from %a.
int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
  return vmlsl_high_n_s32(a, b, c);
}
3380
// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
// CHECK: ret <4 x i32> [[SUB_I_I]]
// Unsigned variant of vmlsl_high_n: umull.v4i32 then vector sub from %a.
uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
  return vmlsl_high_n_u16(a, b, c);
}
3397
// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
// CHECK: ret <2 x i64> [[SUB_I_I]]
// Unsigned variant of vmlsl_high_n: umull.v2i64 then vector sub from %a.
uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
  return vmlsl_high_n_u32(a, b, c);
}
3412
// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2
// CHECK: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMLSL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLSL_V6_I_I]]
// Pins lowering of vqdmlsl_high_n_s16: sqdmull.v4i32 followed by a saturating
// subtract (sqsub.v4i32) from %a.
int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
  return vqdmlsl_high_n_s16(a, b, c);
}
3431
// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2
// CHECK: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQDMLSL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLSL_V4_I_I]]
// Pins lowering of vqdmlsl_high_n_s32: sqdmull.v2i64 followed by a saturating
// subtract (sqsub.v2i64) from %a.
int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
  return vqdmlsl_high_n_s32(a, b, c);
}
3448
// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
// CHECK: ret <2 x float> [[MUL_I]]
// Pins lowering of vmul_n_f32: splat %b then a plain IR fmul (no intrinsic).
float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
  return vmul_n_f32(a, b);
}
3457
// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
// CHECK: ret <4 x float> [[MUL_I]]
// Pins lowering of vmulq_n_f32: splat %b across 4 lanes then a plain fmul.
float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
  return vmulq_n_f32(a, b);
}
3468
// CHECK-LABEL: define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %b, i32 1
// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %a, [[VECINIT1_I]]
// CHECK: ret <2 x double> [[MUL_I]]
// Pins lowering of vmulq_n_f64 (AArch64-only f64 variant): splat then fmul.
float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
  return vmulq_n_f64(a, b);
}
3477
// CHECK-LABEL: define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2
// CHECK: ret <2 x float> [[TMP6]]
// Pins lowering of vfma_n_f32: splatted %n becomes the second multiplicand of
// llvm.fma.v2f32, with %a as the addend (last operand).
float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
  return vfma_n_f32(a, b, n);
}
3492
// CHECK-LABEL: define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2
// CHECK: ret <4 x float> [[TMP6]]
// 128-bit variant of vfma_n: splat %n, then llvm.fma.v4f32 with %a as addend.
float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
  return vfmaq_n_f32(a, b, n);
}
3509
// CHECK-LABEL: define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
// CHECK: [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2
// CHECK: ret <2 x float> [[TMP6]]
// Pins lowering of vfms_n_f32: %b is negated via fsub from -0.0 (sign-correct
// for IEEE negation) before feeding the fused multiply-add.
float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
  return vfms_n_f32(a, b, n);
}
3525
// CHECK-LABEL: define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
// CHECK: [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2
// CHECK: ret <4 x float> [[TMP6]]
// 128-bit variant of vfms_n: negate %b via fsub from -0.0, then llvm.fma.v4f32.
float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
  return vfmsq_n_f32(a, b, n);
}
3543
// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 %b) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
// CHECK: ret <4 x i16> [[MUL_I]]
// Pins lowering of vmul_n_s16: splat %b then a plain IR mul (no intrinsic).
int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
  return vmul_n_s16(a, b);
}
3554
// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 %b) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
// CHECK: ret <8 x i16> [[MUL_I]]
// Pins lowering of vmulq_n_s16: splat %b across all 8 lanes then a plain mul.
int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
  return vmulq_n_s16(a, b);
}
3569
// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
// CHECK: ret <2 x i32> [[MUL_I]]
// Pins lowering of vmul_n_s32: splat %b then a plain mul.
int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
  return vmul_n_s32(a, b);
}
3578
// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> %a, i32 %b) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
// CHECK: ret <4 x i32> [[MUL_I]]
// Pins lowering of vmulq_n_s32: splat %b across 4 lanes then a plain mul.
int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
  return vmulq_n_s32(a, b);
}
3589
// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 %b) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
// CHECK: ret <4 x i16> [[MUL_I]]
// Unsigned variant of vmul_n: identical IR to the signed case (mul is
// sign-agnostic for same-width multiply).
uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
  return vmul_n_u16(a, b);
}
3600
// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 %b) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
// CHECK: ret <8 x i16> [[MUL_I]]
// Unsigned 128-bit variant: splat %b across 8 lanes then a plain mul.
uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
  return vmulq_n_u16(a, b);
}
3615
// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
// CHECK: ret <2 x i32> [[MUL_I]]
// Unsigned variant of vmul_n_s32: splat %b then a plain mul.
uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
  return vmul_n_u32(a, b);
}
3624
// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
// CHECK: ret <4 x i32> [[MUL_I]]
// Unsigned 128-bit variant: splat %b across 4 lanes then a plain mul.
uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
  return vmulq_n_u32(a, b);
}
3635
// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2
// CHECK: ret <4 x i32> [[VMULL5_I]]
// Pins lowering of vmull_n_s16: signed widening multiply (smull.v4i32) of %a
// by the splatted scalar %b.
int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
  return vmull_n_s16(a, b);
}
3650
// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2
// CHECK: ret <2 x i64> [[VMULL3_I]]
// Pins lowering of vmull_n_s32: smull.v2i64 of %a by the splatted scalar %b.
int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
  return vmull_n_s32(a, b);
}
3663
3664 // CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 %b) #0 {
3665 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3666 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3667 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3668 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3669 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3670 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3671 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3672 // CHECK: [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3673 // CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2
3674 // CHECK: ret <4 x i32> [[VMULL5_I]]
// Verifies (via the CHECK lines above) that vmull_n_u16 splats b and lowers
// to the unsigned widening multiply @llvm.aarch64.neon.umull.v4i32.
uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
  return vmull_n_u16(a, b);
}
3678
3679 // CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 {
3680 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3681 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3682 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3683 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3684 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3685 // CHECK: [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3686 // CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2
3687 // CHECK: ret <2 x i64> [[VMULL3_I]]
// Verifies (via the CHECK lines above) that vmull_n_u32 splats b and lowers
// to the unsigned widening multiply @llvm.aarch64.neon.umull.v2i64.
uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
  return vmull_n_u32(a, b);
}
3691
3692 // CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 %b) #0 {
3693 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3694 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3695 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3696 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3697 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3698 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3699 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3700 // CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3701 // CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #2
3702 // CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
3703 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32>
3704 // CHECK: ret <4 x i32> [[TMP2]]
// Verifies (via the CHECK lines above) that vqdmull_n_s16 splats b and
// lowers to @llvm.aarch64.neon.sqdmull.v4i32.
int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
  return vqdmull_n_s16(a, b);
}
3708
3709 // CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 {
3710 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3711 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3712 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3713 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3714 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3715 // CHECK: [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3716 // CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #2
3717 // CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
3718 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64>
3719 // CHECK: ret <2 x i64> [[TMP2]]
// Verifies (via the CHECK lines above) that vqdmull_n_s32 splats b and
// lowers to @llvm.aarch64.neon.sqdmull.v2i64.
int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
  return vqdmull_n_s32(a, b);
}
3723
3724 // CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 %b) #0 {
3725 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3726 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3727 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3728 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3729 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3730 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3731 // CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3732 // CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3733 // CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #2
3734 // CHECK: [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
3735 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16>
3736 // CHECK: ret <4 x i16> [[TMP2]]
// Verifies (via the CHECK lines above) that vqdmulh_n_s16 splats b and
// lowers to @llvm.aarch64.neon.sqdmulh.v4i16.
int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
  return vqdmulh_n_s16(a, b);
}
3740
3741 // CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 {
3742 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3743 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3744 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3745 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3746 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3747 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3748 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3749 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3750 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3751 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
3752 // CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3753 // CHECK: [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3754 // CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #2
3755 // CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
3756 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16>
3757 // CHECK: ret <8 x i16> [[TMP2]]
// Verifies (via the CHECK lines above) that vqdmulhq_n_s16 splats b across
// all 8 lanes and lowers to @llvm.aarch64.neon.sqdmulh.v8i16.
int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
  return vqdmulhq_n_s16(a, b);
}
3761
3762 // CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
3763 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3764 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3765 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3766 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3767 // CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3768 // CHECK: [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3769 // CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #2
3770 // CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
3771 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32>
3772 // CHECK: ret <2 x i32> [[TMP2]]
// Verifies (via the CHECK lines above) that vqdmulh_n_s32 splats b and
// lowers to @llvm.aarch64.neon.sqdmulh.v2i32.
int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
  return vqdmulh_n_s32(a, b);
}
3776
3777 // CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
3778 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3779 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3780 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3781 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3782 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3783 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
3784 // CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3785 // CHECK: [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
3786 // CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #2
3787 // CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
3788 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32>
3789 // CHECK: ret <4 x i32> [[TMP2]]
// Verifies (via the CHECK lines above) that vqdmulhq_n_s32 splats b across
// all 4 lanes and lowers to @llvm.aarch64.neon.sqdmulh.v4i32.
int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
  return vqdmulhq_n_s32(a, b);
}
3793
3794 // CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 %b) #0 {
3795 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3796 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3797 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3798 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3799 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3800 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3801 // CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3802 // CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3803 // CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #2
3804 // CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
3805 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16>
3806 // CHECK: ret <4 x i16> [[TMP2]]
// Verifies (via the CHECK lines above) that vqrdmulh_n_s16 splats b and
// lowers to @llvm.aarch64.neon.sqrdmulh.v4i16.
int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
  return vqrdmulh_n_s16(a, b);
}
3810
3811 // CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 {
3812 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3813 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3814 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3815 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3816 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3817 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3818 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3819 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3820 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3821 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
3822 // CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3823 // CHECK: [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3824 // CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #2
3825 // CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
3826 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16>
3827 // CHECK: ret <8 x i16> [[TMP2]]
// Verifies (via the CHECK lines above) that vqrdmulhq_n_s16 splats b across
// all 8 lanes and lowers to @llvm.aarch64.neon.sqrdmulh.v8i16.
int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
  return vqrdmulhq_n_s16(a, b);
}
3831
3832 // CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
3833 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3834 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3835 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3836 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3837 // CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3838 // CHECK: [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3839 // CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #2
3840 // CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
3841 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32>
3842 // CHECK: ret <2 x i32> [[TMP2]]
// Verifies (via the CHECK lines above) that vqrdmulh_n_s32 splats b and
// lowers to @llvm.aarch64.neon.sqrdmulh.v2i32.
int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
  return vqrdmulh_n_s32(a, b);
}
3846
3847 // CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
3848 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3849 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3850 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3851 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3852 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3853 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
3854 // CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3855 // CHECK: [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
3856 // CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #2
3857 // CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
3858 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V6_I]] to <4 x i32>
3859 // CHECK: ret <4 x i32> [[TMP2]]
// Verifies (via the CHECK lines above) that vqrdmulhq_n_s32 splats b across
// all 4 lanes and lowers to @llvm.aarch64.neon.sqrdmulh.v4i32.
int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
  return vqrdmulhq_n_s32(a, b);
}
3863
3864 // CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
3865 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3866 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3867 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3868 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3869 // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3870 // CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
3871 // CHECK: ret <4 x i16> [[ADD_I]]
// Verifies (via the CHECK lines above) that vmla_n_s16 emits a splat of c
// followed by plain IR mul and add (no intrinsic call).
int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
  return vmla_n_s16(a, b, c);
}
3875
3876 // CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
3877 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3878 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3879 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3880 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3881 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3882 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3883 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3884 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3885 // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3886 // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
3887 // CHECK: ret <8 x i16> [[ADD_I]]
// Verifies (via the CHECK lines above) that vmlaq_n_s16 splats c into all
// 8 lanes, then emits plain IR mul and add.
int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
  return vmlaq_n_s16(a, b, c);
}
3891
3892 // CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
3893 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3894 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3895 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3896 // CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
3897 // CHECK: ret <2 x i32> [[ADD_I]]
// Verifies (via the CHECK lines above) that vmla_n_s32 emits a splat of c
// followed by plain IR mul and add.
int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
  return vmla_n_s32(a, b, c);
}
3901
3902 // CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
3903 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3904 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3905 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3906 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3907 // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3908 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
3909 // CHECK: ret <4 x i32> [[ADD_I]]
// Verifies (via the CHECK lines above) that vmlaq_n_s32 splats c into all
// 4 lanes, then emits plain IR mul and add.
int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
  return vmlaq_n_s32(a, b, c);
}
3913
3914 // CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
3915 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3916 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3917 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3918 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3919 // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3920 // CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
3921 // CHECK: ret <4 x i16> [[ADD_I]]
// Verifies (via the CHECK lines above) that vmla_n_u16 emits the same
// splat + mul + add pattern as the signed variant.
uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
  return vmla_n_u16(a, b, c);
}
3925
3926 // CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
3927 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3928 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3929 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3930 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3931 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3932 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3933 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3934 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3935 // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3936 // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
3937 // CHECK: ret <8 x i16> [[ADD_I]]
// Verifies (via the CHECK lines above) that vmlaq_n_u16 splats c into all
// 8 lanes, then emits plain IR mul and add.
uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
  return vmlaq_n_u16(a, b, c);
}
3941
3942 // CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
3943 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3944 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3945 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3946 // CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
3947 // CHECK: ret <2 x i32> [[ADD_I]]
// Verifies (via the CHECK lines above) that vmla_n_u32 emits a splat of c
// followed by plain IR mul and add.
uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
  return vmla_n_u32(a, b, c);
}
3951
3952 // CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
3953 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3954 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3955 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3956 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3957 // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3958 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
3959 // CHECK: ret <4 x i32> [[ADD_I]]
// Verifies (via the CHECK lines above) that vmlaq_n_u32 splats c into all
// 4 lanes, then emits plain IR mul and add.
uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
  return vmlaq_n_u32(a, b, c);
}
3963
3964 // CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
3965 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3966 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3967 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3968 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3969 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3970 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3971 // CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3972 // CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3973 // CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
3974 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
3975 // CHECK: ret <4 x i32> [[ADD_I]]
// Verifies (via the CHECK lines above) that vmlal_n_s16 splats c, calls
// @llvm.aarch64.neon.smull.v4i32, and accumulates with a plain IR add.
int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vmlal_n_s16(a, b, c);
}
3979
3980 // CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
3981 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3982 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3983 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3984 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3985 // CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3986 // CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3987 // CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
3988 // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
3989 // CHECK: ret <2 x i64> [[ADD_I]]
// Verifies (via the CHECK lines above) that vmlal_n_s32 splats c, calls
// @llvm.aarch64.neon.smull.v2i64, and accumulates with a plain IR add.
int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vmlal_n_s32(a, b, c);
}
3993
3994 // CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
3995 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3996 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3997 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3998 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3999 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4000 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4001 // CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
4002 // CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4003 // CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
4004 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
4005 // CHECK: ret <4 x i32> [[ADD_I]]
// Verifies (via the CHECK lines above) that vmlal_n_u16 splats c, calls
// @llvm.aarch64.neon.umull.v4i32, and accumulates with a plain IR add.
uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
  return vmlal_n_u16(a, b, c);
}
4009
4010 // CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
4011 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
4012 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
4013 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4014 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4015 // CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
4016 // CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4017 // CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
4018 // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
4019 // CHECK: ret <2 x i64> [[ADD_I]]
// Verifies (via the CHECK lines above) that vmlal_n_u32 splats c, calls
// @llvm.aarch64.neon.umull.v2i64, and accumulates with a plain IR add.
uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
  return vmlal_n_u32(a, b, c);
}
4023
4024 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
4025 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4026 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4027 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
4028 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
4029 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
4030 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
4031 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4032 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4033 // CHECK: [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
4034 // CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2
4035 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4036 // CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2
4037 // CHECK: ret <4 x i32> [[VQDMLAL_V6_I]]
// Verifies (via the CHECK lines above) that vqdmlal_n_s16 lowers to
// @llvm.aarch64.neon.sqdmull.v4i32 followed by a saturating
// @llvm.aarch64.neon.sqadd.v4i32 accumulation.
int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vqdmlal_n_s16(a, b, c);
}
4041
4042 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
4043 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4044 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4045 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
4046 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
4047 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4048 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4049 // CHECK: [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
4050 // CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2
4051 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
4052 // CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2
4053 // CHECK: ret <2 x i64> [[VQDMLAL_V4_I]]
// Verifies (via the CHECK lines above) that vqdmlal_n_s32 lowers to
// @llvm.aarch64.neon.sqdmull.v2i64 followed by a saturating
// @llvm.aarch64.neon.sqadd.v2i64 accumulation.
int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vqdmlal_n_s32(a, b, c);
}
4057
4058 // CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
4059 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
4060 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
4061 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
4062 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
4063 // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
4064 // CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
4065 // CHECK: ret <4 x i16> [[SUB_I]]
// Verifies (via the CHECK lines above) that vmls_n_s16 emits a splat of c
// followed by plain IR mul and sub.
int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
  return vmls_n_s16(a, b, c);
}
4069
4070 // CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
4071 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
4072 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
4073 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
4074 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
4075 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
4076 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
4077 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
4078 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
4079 // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
4080 // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
4081 // CHECK: ret <8 x i16> [[SUB_I]]
// Verifies (via the CHECK lines above) that vmlsq_n_s16 splats c into all
// 8 lanes, then emits plain IR mul and sub.
int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
  return vmlsq_n_s16(a, b, c);
}
4085
4086 // CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
4087 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
4088 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
4089 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
4090 // CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
4091 // CHECK: ret <2 x i32> [[SUB_I]]
// Verifies (via the CHECK lines above) that vmls_n_s32 emits a splat of c
// followed by plain IR mul and sub.
int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
  return vmls_n_s32(a, b, c);
}
4095
4096 // CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
4097 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
4098 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
4099 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
4100 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
4101 // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
4102 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
4103 // CHECK: ret <4 x i32> [[SUB_I]]
// Verifies (via the CHECK lines above) that vmlsq_n_s32 splats c into all
// 4 lanes, then emits plain IR mul and sub.
int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
  return vmlsq_n_s32(a, b, c);
}
4107
4108 // CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
4109 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
4110 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
4111 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
4112 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
4113 // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
4114 // CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
4115 // CHECK: ret <4 x i16> [[SUB_I]]
// Verifies (via the CHECK lines above) that vmls_n_u16 emits the same
// splat + mul + sub pattern as the signed variant.
uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
  return vmls_n_u16(a, b, c);
}
4119
// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
// CHECK: ret <8 x i16> [[SUB_I]]
// vmlsq_n_u16: c is inserted into all 8 lanes before mul/sub.
uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
  return vmlsq_n_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
// CHECK: ret <2 x i32> [[SUB_I]]
// vmls_n_u32: two-lane splat of c, then mul/sub.
uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
  return vmls_n_u32(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
// CHECK: ret <4 x i32> [[SUB_I]]
// vmlsq_n_u32: quad-lane splat of c, then mul/sub.
uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
  return vmlsq_n_u32(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK: ret <4 x i32> [[SUB_I]]
// vmlsl_n_s16: widening multiply-subtract — splat c, smull to i32, sub from a.
int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vmlsl_n_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK: ret <2 x i64> [[SUB_I]]
// vmlsl_n_s32: splat c, smull to i64, sub from a.
int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vmlsl_n_s32(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK: ret <4 x i32> [[SUB_I]]
// vmlsl_n_u16: unsigned widening variant — selects umull instead of smull.
uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
  return vmlsl_n_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK: ret <2 x i64> [[SUB_I]]
// vmlsl_n_u32: splat c, umull to i64, sub from a.
uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
  return vmlsl_n_u32(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2
// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLSL_V6_I]]
// vqdmlsl_n_s16: saturating doubling multiply-subtract long — sqdmull then sqsub.
int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vqdmlsl_n_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2
// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLSL_V4_I]]
// vqdmlsl_n_s32: sqdmull to i64, then saturating subtract from a.
int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vqdmlsl_n_s32(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
// CHECK: ret <4 x i16> [[ADD]]
// Lane 0 selection becomes a zeroinitializer shuffle mask.
uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
  return vmla_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
// CHECK: ret <8 x i16> [[ADD]]
// 64-bit v widened to 8 lanes via the lane-0 shuffle.
uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
  return vmlaq_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
// CHECK: ret <2 x i32> [[ADD]]
// vmla_lane_u32 lane 0: shuffle-splat, mul, add.
uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
  return vmla_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
// CHECK: ret <4 x i32> [[ADD]]
// vmlaq_lane_u32 lane 0: lane splat into 4 lanes, mul, add.
uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
  return vmlaq_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
// CHECK: ret <4 x i16> [[ADD]]
// laneq form: 128-bit v narrowed to 4 lanes through the shuffle.
uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
  return vmla_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
// CHECK: ret <8 x i16> [[ADD]]
// vmlaq_laneq_u16 lane 0: full-width splat, mul, add.
uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
  return vmlaq_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
// CHECK: ret <2 x i32> [[ADD]]
// vmla_laneq_u32 lane 0: 4-lane v narrowed to 2 lanes via shuffle.
uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
  return vmla_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
// CHECK: ret <4 x i32> [[ADD]]
// vmlaq_laneq_u32 lane 0: same-width splat, mul, add.
uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
  return vmlaq_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
// vqdmlal_laneq_s16 lane 0: lane splat, sqdmull, then saturating add.
int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vqdmlal_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
// vqdmlal_laneq_s32 lane 0: sqdmull to i64, then saturating add.
int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vqdmlal_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
// _high form: takes the upper half of b (lanes 4-7) before sqdmull + sqadd.
int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vqdmlal_high_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
// _high form: upper half of b (lanes 2-3), sqdmull to i64, saturating add.
int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vqdmlal_high_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
// CHECK: ret <4 x i16> [[SUB]]
// vmls_lane_u16 lane 0: splat, mul, sub.
uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
  return vmls_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
// CHECK: ret <8 x i16> [[SUB]]
// vmlsq_lane_u16 lane 0: splat into 8 lanes, mul, sub.
uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
  return vmlsq_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
// CHECK: ret <2 x i32> [[SUB]]
// vmls_lane_u32 lane 0: splat, mul, sub.
uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
  return vmls_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
// CHECK: ret <4 x i32> [[SUB]]
// vmlsq_lane_u32 lane 0: splat into 4 lanes, mul, sub.
uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
  return vmlsq_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
// CHECK: ret <4 x i16> [[SUB]]
// vmls_laneq_u16 lane 0: 8-lane v narrowed to 4 via shuffle, mul, sub.
uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
  return vmls_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
// CHECK: ret <8 x i16> [[SUB]]
// vmlsq_laneq_u16 lane 0: full-width splat, mul, sub.
uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
  return vmlsq_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
// CHECK: ret <2 x i32> [[SUB]]
// vmls_laneq_u32 lane 0: 4-lane v narrowed to 2 via shuffle, mul, sub.
uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
  return vmls_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
// CHECK: ret <4 x i32> [[SUB]]
// vmlsq_laneq_u32 lane 0: same-width splat, mul, sub.
uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
  return vmlsq_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
// vqdmlsl_laneq_s16 lane 0: sqdmull then saturating subtract.
int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vqdmlsl_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
// vqdmlsl_laneq_s32 lane 0: sqdmull to i64 then saturating subtract.
int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vqdmlsl_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
// _high form: upper half of b (lanes 4-7), sqdmull, then saturating subtract.
int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vqdmlsl_high_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
// _high form: upper half of b (lanes 2-3), sqdmull to i64, saturating subtract.
int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vqdmlsl_high_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[TMP2]]
// vqdmulh_laneq_s16 lane 0: lane splat feeds the sqdmulh intrinsic.
int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vqdmulh_laneq_s16(a, v, 0);
}

// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[TMP2]]
// vqdmulhq_laneq_s16 lane 0: 128-bit sqdmulh with lane splat.
int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vqdmulhq_laneq_s16(a, v, 0);
}

// CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[TMP2]]
// vqdmulh_laneq_s32 lane 0: 2-lane sqdmulh with lane splat.
int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vqdmulh_laneq_s32(a, v, 0);
}

// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[TMP2]]
// vqdmulhq_laneq_s32 lane 0: 4-lane sqdmulh with lane splat.
int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vqdmulhq_laneq_s32(a, v, 0);
}

// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[TMP2]]
// vqrdmulh_laneq_s16 lane 0: rounding variant selects sqrdmulh.
int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vqrdmulh_laneq_s16(a, v, 0);
}

// vqrdmulhq_laneq_s16, lane 0: zeroinitializer splat feeding sqrdmulh.v8i16.
// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vqrdmulhq_laneq_s16(a, v, 0);
}
4603
// vqrdmulh_laneq_s32, lane 0: zeroinitializer splat feeding sqrdmulh.v2i32.
// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vqrdmulh_laneq_s32(a, v, 0);
}
4617
// vqrdmulhq_laneq_s32, lane 0: zeroinitializer splat feeding sqrdmulh.v4i32.
// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vqrdmulhq_laneq_s32(a, v, 0);
}
4631
// vmla_lane_u16, lane 3: lowers to a splat shuffle plus plain mul/add (no intrinsic call).
// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
// CHECK:   ret <4 x i16> [[ADD]]
uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
  return vmla_lane_u16(a, b, v, 3);
}
4640
// vmlaq_lane_u16, lane 3: 64-bit lane source widened to an 8-element splat, then mul/add.
// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
// CHECK:   ret <8 x i16> [[ADD]]
uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
  return vmlaq_lane_u16(a, b, v, 3);
}
4649
// vmla_lane_u32, lane 1 (highest valid lane of a 2-element vector): splat + mul/add.
// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
// CHECK:   ret <2 x i32> [[ADD]]
uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
  return vmla_lane_u32(a, b, v, 1);
}
4658
// vmlaq_lane_u32, lane 1: 64-bit lane source widened to a 4-element splat, then mul/add.
// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
// CHECK:   ret <4 x i32> [[ADD]]
uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
  return vmlaq_lane_u32(a, b, v, 1);
}
4667
// vmla_laneq_u16, lane 7 (highest lane of the 128-bit source): narrow splat + mul/add.
// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
// CHECK:   ret <4 x i16> [[ADD]]
uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
  return vmla_laneq_u16(a, b, v, 7);
}
4676
// vmlaq_laneq_u16, lane 7: full-width splat + mul/add.
// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
// CHECK:   ret <8 x i16> [[ADD]]
uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
  return vmlaq_laneq_u16(a, b, v, 7);
}
4685
// vmla_laneq_u32, lane 3: narrow splat from the 128-bit source + mul/add.
// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
// CHECK:   ret <2 x i32> [[ADD]]
uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
  return vmla_laneq_u32(a, b, v, 3);
}
4694
// vmlaq_laneq_u32, lane 3: full-width splat + mul/add.
// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
// CHECK:   ret <4 x i32> [[ADD]]
uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
  return vmlaq_laneq_u32(a, b, v, 3);
}
4703
// vqdmlal_laneq_s16, lane 7: splat, sqdmull (widening doubling multiply), then sqadd accumulate.
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vqdmlal_laneq_s16(a, b, v, 7);
}
4718
// vqdmlal_laneq_s32, lane 3: splat, sqdmull.v2i64, then sqadd accumulate.
// CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vqdmlal_laneq_s32(a, b, v, 3);
}
4733
// vqdmlal_high_laneq_s16: upper half of %b extracted first, then lane-7 splat, sqdmull, sqadd.
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vqdmlal_high_laneq_s16(a, b, v, 7);
}
4749
// vqdmlal_high_laneq_s32: upper half of %b extracted first, then lane-3 splat, sqdmull, sqadd.
// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vqdmlal_high_laneq_s32(a, b, v, 3);
}
4765
// vmls_lane_u16, lane 3: splat + mul, then sub (multiply-subtract counterpart of vmla).
// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
// CHECK:   ret <4 x i16> [[SUB]]
uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
  return vmls_lane_u16(a, b, v, 3);
}
4774
// vmlsq_lane_u16, lane 3: widened splat + mul, then sub.
// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
// CHECK:   ret <8 x i16> [[SUB]]
uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
  return vmlsq_lane_u16(a, b, v, 3);
}
4783
// vmls_lane_u32, lane 1: splat + mul, then sub.
// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
// CHECK:   ret <2 x i32> [[SUB]]
uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
  return vmls_lane_u32(a, b, v, 1);
}
4792
// vmlsq_lane_u32, lane 1: widened splat + mul, then sub.
// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
// CHECK:   ret <4 x i32> [[SUB]]
uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
  return vmlsq_lane_u32(a, b, v, 1);
}
4801
// vmls_laneq_u16, lane 7: narrow splat from 128-bit source + mul, then sub.
// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
// CHECK:   ret <4 x i16> [[SUB]]
uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
  return vmls_laneq_u16(a, b, v, 7);
}
4810
// vmlsq_laneq_u16, lane 7: full-width splat + mul, then sub.
// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
// CHECK:   ret <8 x i16> [[SUB]]
uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
  return vmlsq_laneq_u16(a, b, v, 7);
}
4819
// vmls_laneq_u32, lane 3: narrow splat + mul, then sub.
// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
// CHECK:   ret <2 x i32> [[SUB]]
uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
  return vmls_laneq_u32(a, b, v, 3);
}
4828
// vmlsq_laneq_u32, lane 3: full-width splat + mul, then sub.
// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
// CHECK:   ret <4 x i32> [[SUB]]
uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
  return vmlsq_laneq_u32(a, b, v, 3);
}
4837
// vqdmlsl_laneq_s16, lane 7: like vqdmlal but accumulates with sqsub instead of sqadd.
// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vqdmlsl_laneq_s16(a, b, v, 7);
}
4852
// vqdmlsl_laneq_s32, lane 3: splat, sqdmull.v2i64, then sqsub accumulate.
// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vqdmlsl_laneq_s32(a, b, v, 3);
}
4867
// vqdmlsl_high_laneq_s16: upper half of %b extracted first, lane-7 splat, sqdmull, sqsub.
// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vqdmlsl_high_laneq_s16(a, b, v, 7);
}
4883
// vqdmlsl_high_laneq_s32: upper half of %b extracted first, lane-3 splat, sqdmull, sqsub.
// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vqdmlsl_high_laneq_s32(a, b, v, 3);
}
4899
// vqdmulh_laneq_s16, lane 7 (max lane): splat feeding sqdmulh.v4i16.
// CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
  return vqdmulh_laneq_s16(a, v, 7);
}
4913
// vqdmulhq_laneq_s16, lane 7: splat feeding sqdmulh.v8i16.
// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
  return vqdmulhq_laneq_s16(a, v, 7);
}
4927
// vqdmulh_laneq_s32, lane 3 (max lane): splat feeding sqdmulh.v2i32.
// CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
  return vqdmulh_laneq_s32(a, v, 3);
}
4941
// vqdmulhq_laneq_s32, lane 3: splat feeding sqdmulh.v4i32.
// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
  return vqdmulhq_laneq_s32(a, v, 3);
}
4955
// vqrdmulh_laneq_s16, lane 7: splat feeding sqrdmulh.v4i16 (rounding variant of sqdmulh).
// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
  return vqrdmulh_laneq_s16(a, v, 7);
}
4969
// vqrdmulhq_laneq_s16, lane 7: splat feeding sqrdmulh.v8i16.
// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
  return vqrdmulhq_laneq_s16(a, v, 7);
}
4983
// vqrdmulh_laneq_s32, lane 3: splat feeding sqrdmulh.v2i32.
// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
  return vqrdmulh_laneq_s32(a, v, 3);
}
4997
// vqrdmulhq_laneq_s32, lane 3: splat feeding sqrdmulh.v4i32.
// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
  return vqrdmulhq_laneq_s32(a, v, 3);
}
5011
5012