// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\
// RUN:  -target-cpu swift -fallow-half-arguments-and-returns -ffreestanding -emit-llvm -o - %s \
// RUN:  | opt -S -mem2reg | FileCheck %s

// REQUIRES: long-tests

#include <arm_neon.h>
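
// Each test_* wrapper below exercises a single NEON intrinsic; the CHECK
// lines match the LLVM IR emitted by the RUN invocation above (after the
// mem2reg cleanup pass).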
8 
9 // CHECK-LABEL: define <8 x i8> @test_vaba_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
10 // CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) #4
11 // CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]]
12 // CHECK:   ret <8 x i8> [[ADD_I]]
test_vaba_s8(int8x8_t a,int8x8_t b,int8x8_t c)13 int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
14   return vaba_s8(a, b, c);
15 }
16 
17 // CHECK-LABEL: define <4 x i16> @test_vaba_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
18 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
19 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
20 // CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
21 // CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
22 // CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4
23 // CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
24 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
25 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[TMP2]]
26 // CHECK:   ret <4 x i16> [[ADD_I]]
test_vaba_s16(int16x4_t a,int16x4_t b,int16x4_t c)27 int16x4_t test_vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
28   return vaba_s16(a, b, c);
29 }
30 
31 // CHECK-LABEL: define <2 x i32> @test_vaba_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
32 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
33 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
34 // CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
35 // CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
36 // CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4
37 // CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
38 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
39 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[TMP2]]
40 // CHECK:   ret <2 x i32> [[ADD_I]]
test_vaba_s32(int32x2_t a,int32x2_t b,int32x2_t c)41 int32x2_t test_vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
42   return vaba_s32(a, b, c);
43 }
44 
45 // CHECK-LABEL: define <8 x i8> @test_vaba_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
46 // CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) #4
47 // CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]]
48 // CHECK:   ret <8 x i8> [[ADD_I]]
test_vaba_u8(uint8x8_t a,uint8x8_t b,uint8x8_t c)49 uint8x8_t test_vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
50   return vaba_u8(a, b, c);
51 }
52 
53 // CHECK-LABEL: define <4 x i16> @test_vaba_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
54 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
55 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
56 // CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
57 // CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
58 // CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4
59 // CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
60 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
61 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[TMP2]]
62 // CHECK:   ret <4 x i16> [[ADD_I]]
test_vaba_u16(uint16x4_t a,uint16x4_t b,uint16x4_t c)63 uint16x4_t test_vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
64   return vaba_u16(a, b, c);
65 }
66 
67 // CHECK-LABEL: define <2 x i32> @test_vaba_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
68 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
69 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
70 // CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
71 // CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
72 // CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4
73 // CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
74 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
75 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[TMP2]]
76 // CHECK:   ret <2 x i32> [[ADD_I]]
test_vaba_u32(uint32x2_t a,uint32x2_t b,uint32x2_t c)77 uint32x2_t test_vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
78   return vaba_u32(a, b, c);
79 }
80 
81 // CHECK-LABEL: define <16 x i8> @test_vabaq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
82 // CHECK:   [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %b, <16 x i8> %c) #4
83 // CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]]
84 // CHECK:   ret <16 x i8> [[ADD_I]]
test_vabaq_s8(int8x16_t a,int8x16_t b,int8x16_t c)85 int8x16_t test_vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
86   return vabaq_s8(a, b, c);
87 }
88 
89 // CHECK-LABEL: define <8 x i16> @test_vabaq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
90 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
91 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
92 // CHECK:   [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
93 // CHECK:   [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
94 // CHECK:   [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[VABDQ_V_I_I]], <8 x i16> [[VABDQ_V1_I_I]]) #4
95 // CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8>
96 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <8 x i16>
97 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP2]]
98 // CHECK:   ret <8 x i16> [[ADD_I]]
test_vabaq_s16(int16x8_t a,int16x8_t b,int16x8_t c)99 int16x8_t test_vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
100   return vabaq_s16(a, b, c);
101 }
102 
103 // CHECK-LABEL: define <4 x i32> @test_vabaq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
104 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
105 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
106 // CHECK:   [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
107 // CHECK:   [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
108 // CHECK:   [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[VABDQ_V_I_I]], <4 x i32> [[VABDQ_V1_I_I]]) #4
109 // CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8>
110 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <4 x i32>
111 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]]
112 // CHECK:   ret <4 x i32> [[ADD_I]]
test_vabaq_s32(int32x4_t a,int32x4_t b,int32x4_t c)113 int32x4_t test_vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
114   return vabaq_s32(a, b, c);
115 }
116 
117 // CHECK-LABEL: define <16 x i8> @test_vabaq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
118 // CHECK:   [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %b, <16 x i8> %c) #4
119 // CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]]
120 // CHECK:   ret <16 x i8> [[ADD_I]]
test_vabaq_u8(uint8x16_t a,uint8x16_t b,uint8x16_t c)121 uint8x16_t test_vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
122   return vabaq_u8(a, b, c);
123 }
124 
125 // CHECK-LABEL: define <8 x i16> @test_vabaq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
126 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
127 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
128 // CHECK:   [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
129 // CHECK:   [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
130 // CHECK:   [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[VABDQ_V_I_I]], <8 x i16> [[VABDQ_V1_I_I]]) #4
131 // CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8>
132 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <8 x i16>
133 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP2]]
134 // CHECK:   ret <8 x i16> [[ADD_I]]
test_vabaq_u16(uint16x8_t a,uint16x8_t b,uint16x8_t c)135 uint16x8_t test_vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
136   return vabaq_u16(a, b, c);
137 }
138 
139 // CHECK-LABEL: define <4 x i32> @test_vabaq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
140 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
141 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
142 // CHECK:   [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
143 // CHECK:   [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
144 // CHECK:   [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[VABDQ_V_I_I]], <4 x i32> [[VABDQ_V1_I_I]]) #4
145 // CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8>
146 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <4 x i32>
147 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]]
148 // CHECK:   ret <4 x i32> [[ADD_I]]
test_vabaq_u32(uint32x4_t a,uint32x4_t b,uint32x4_t c)149 uint32x4_t test_vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
150   return vabaq_u32(a, b, c);
151 }
152 
153 
154 // CHECK-LABEL: define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
155 // CHECK:   [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) #4
156 // CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16>
157 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
158 // CHECK:   ret <8 x i16> [[ADD_I]]
test_vabal_s8(int16x8_t a,int8x8_t b,int8x8_t c)159 int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
160   return vabal_s8(a, b, c);
161 }
162 
163 // CHECK-LABEL: define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
164 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
165 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
166 // CHECK:   [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
167 // CHECK:   [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
168 // CHECK:   [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I_I]], <4 x i16> [[VABD_V1_I_I_I]]) #4
169 // CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
170 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <4 x i16>
171 // CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
172 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
173 // CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
174 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
175 // CHECK:   ret <4 x i32> [[ADD_I]]
test_vabal_s16(int32x4_t a,int16x4_t b,int16x4_t c)176 int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
177   return vabal_s16(a, b, c);
178 }
179 
180 // CHECK-LABEL: define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
181 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
182 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
183 // CHECK:   [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
184 // CHECK:   [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
185 // CHECK:   [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I_I]], <2 x i32> [[VABD_V1_I_I_I]]) #4
186 // CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
187 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <2 x i32>
188 // CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
189 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
190 // CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
191 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
192 // CHECK:   ret <2 x i64> [[ADD_I]]
test_vabal_s32(int64x2_t a,int32x2_t b,int32x2_t c)193 int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
194   return vabal_s32(a, b, c);
195 }
196 
197 // CHECK-LABEL: define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
198 // CHECK:   [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) #4
199 // CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16>
200 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
201 // CHECK:   ret <8 x i16> [[ADD_I]]
test_vabal_u8(uint16x8_t a,uint8x8_t b,uint8x8_t c)202 uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
203   return vabal_u8(a, b, c);
204 }
205 
206 // CHECK-LABEL: define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
207 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
208 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
209 // CHECK:   [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
210 // CHECK:   [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
211 // CHECK:   [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I_I]], <4 x i16> [[VABD_V1_I_I_I]]) #4
212 // CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
213 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <4 x i16>
214 // CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
215 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
216 // CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
217 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
218 // CHECK:   ret <4 x i32> [[ADD_I]]
test_vabal_u16(uint32x4_t a,uint16x4_t b,uint16x4_t c)219 uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
220   return vabal_u16(a, b, c);
221 }
222 
223 // CHECK-LABEL: define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
224 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
225 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
226 // CHECK:   [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
227 // CHECK:   [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
228 // CHECK:   [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I_I]], <2 x i32> [[VABD_V1_I_I_I]]) #4
229 // CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
230 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <2 x i32>
231 // CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
232 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
233 // CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
234 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
235 // CHECK:   ret <2 x i64> [[ADD_I]]
test_vabal_u32(uint64x2_t a,uint32x2_t b,uint32x2_t c)236 uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
237   return vabal_u32(a, b, c);
238 }
239 
240 
241 // CHECK-LABEL: define <8 x i8> @test_vabd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
242 // CHECK:   [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
243 // CHECK:   ret <8 x i8> [[VABD_V_I]]
test_vabd_s8(int8x8_t a,int8x8_t b)244 int8x8_t test_vabd_s8(int8x8_t a, int8x8_t b) {
245   return vabd_s8(a, b);
246 }
247 
248 // CHECK-LABEL: define <4 x i16> @test_vabd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
249 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
250 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
251 // CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
252 // CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
253 // CHECK:   [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I]], <4 x i16> [[VABD_V1_I]]) #4
254 // CHECK:   [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
255 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <4 x i16>
256 // CHECK:   ret <4 x i16> [[TMP2]]
test_vabd_s16(int16x4_t a,int16x4_t b)257 int16x4_t test_vabd_s16(int16x4_t a, int16x4_t b) {
258   return vabd_s16(a, b);
259 }
260 
261 // CHECK-LABEL: define <2 x i32> @test_vabd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
262 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
263 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
264 // CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
265 // CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
266 // CHECK:   [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I]], <2 x i32> [[VABD_V1_I]]) #4
267 // CHECK:   [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
268 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x i32>
269 // CHECK:   ret <2 x i32> [[TMP2]]
test_vabd_s32(int32x2_t a,int32x2_t b)270 int32x2_t test_vabd_s32(int32x2_t a, int32x2_t b) {
271   return vabd_s32(a, b);
272 }
273 
274 // CHECK-LABEL: define <8 x i8> @test_vabd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
275 // CHECK:   [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
276 // CHECK:   ret <8 x i8> [[VABD_V_I]]
test_vabd_u8(uint8x8_t a,uint8x8_t b)277 uint8x8_t test_vabd_u8(uint8x8_t a, uint8x8_t b) {
278   return vabd_u8(a, b);
279 }
280 
281 // CHECK-LABEL: define <4 x i16> @test_vabd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
282 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
283 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
284 // CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
285 // CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
286 // CHECK:   [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I]], <4 x i16> [[VABD_V1_I]]) #4
287 // CHECK:   [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
288 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <4 x i16>
289 // CHECK:   ret <4 x i16> [[TMP2]]
test_vabd_u16(uint16x4_t a,uint16x4_t b)290 uint16x4_t test_vabd_u16(uint16x4_t a, uint16x4_t b) {
291   return vabd_u16(a, b);
292 }
293 
294 // CHECK-LABEL: define <2 x i32> @test_vabd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
295 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
296 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
297 // CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
298 // CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
299 // CHECK:   [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I]], <2 x i32> [[VABD_V1_I]]) #4
300 // CHECK:   [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
301 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x i32>
302 // CHECK:   ret <2 x i32> [[TMP2]]
test_vabd_u32(uint32x2_t a,uint32x2_t b)303 uint32x2_t test_vabd_u32(uint32x2_t a, uint32x2_t b) {
304   return vabd_u32(a, b);
305 }
306 
307 // CHECK-LABEL: define <2 x float> @test_vabd_f32(<2 x float> %a, <2 x float> %b) #0 {
308 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
309 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
310 // CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
311 // CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
312 // CHECK:   [[VABD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> [[VABD_V_I]], <2 x float> [[VABD_V1_I]]) #4
313 // CHECK:   [[VABD_V3_I:%.*]] = bitcast <2 x float> [[VABD_V2_I]] to <8 x i8>
314 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x float>
315 // CHECK:   ret <2 x float> [[TMP2]]
test_vabd_f32(float32x2_t a,float32x2_t b)316 float32x2_t test_vabd_f32(float32x2_t a, float32x2_t b) {
317   return vabd_f32(a, b);
318 }
319 
320 // CHECK-LABEL: define <16 x i8> @test_vabdq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
321 // CHECK:   [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
322 // CHECK:   ret <16 x i8> [[VABDQ_V_I]]
test_vabdq_s8(int8x16_t a,int8x16_t b)323 int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) {
324   return vabdq_s8(a, b);
325 }
326 
327 // CHECK-LABEL: define <8 x i16> @test_vabdq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
328 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
329 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
330 // CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
331 // CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
332 // CHECK:   [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[VABDQ_V_I]], <8 x i16> [[VABDQ_V1_I]]) #4
333 // CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
334 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <8 x i16>
335 // CHECK:   ret <8 x i16> [[TMP2]]
test_vabdq_s16(int16x8_t a,int16x8_t b)336 int16x8_t test_vabdq_s16(int16x8_t a, int16x8_t b) {
337   return vabdq_s16(a, b);
338 }
339 
340 // CHECK-LABEL: define <4 x i32> @test_vabdq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
341 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
342 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
343 // CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
344 // CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
345 // CHECK:   [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[VABDQ_V_I]], <4 x i32> [[VABDQ_V1_I]]) #4
346 // CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
347 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x i32>
348 // CHECK:   ret <4 x i32> [[TMP2]]
test_vabdq_s32(int32x4_t a,int32x4_t b)349 int32x4_t test_vabdq_s32(int32x4_t a, int32x4_t b) {
350   return vabdq_s32(a, b);
351 }
352 
353 // CHECK-LABEL: define <16 x i8> @test_vabdq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
354 // CHECK:   [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
355 // CHECK:   ret <16 x i8> [[VABDQ_V_I]]
test_vabdq_u8(uint8x16_t a,uint8x16_t b)356 uint8x16_t test_vabdq_u8(uint8x16_t a, uint8x16_t b) {
357   return vabdq_u8(a, b);
358 }
359 
360 // CHECK-LABEL: define <8 x i16> @test_vabdq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
361 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
362 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
363 // CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
364 // CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
365 // CHECK:   [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[VABDQ_V_I]], <8 x i16> [[VABDQ_V1_I]]) #4
366 // CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
367 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <8 x i16>
368 // CHECK:   ret <8 x i16> [[TMP2]]
test_vabdq_u16(uint16x8_t a,uint16x8_t b)369 uint16x8_t test_vabdq_u16(uint16x8_t a, uint16x8_t b) {
370   return vabdq_u16(a, b);
371 }
372 
373 // CHECK-LABEL: define <4 x i32> @test_vabdq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
374 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
375 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
376 // CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
377 // CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
378 // CHECK:   [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[VABDQ_V_I]], <4 x i32> [[VABDQ_V1_I]]) #4
379 // CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
380 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x i32>
381 // CHECK:   ret <4 x i32> [[TMP2]]
test_vabdq_u32(uint32x4_t a,uint32x4_t b)382 uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) {
383   return vabdq_u32(a, b);
384 }
385 
386 // CHECK-LABEL: define <4 x float> @test_vabdq_f32(<4 x float> %a, <4 x float> %b) #0 {
387 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
388 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
389 // CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
390 // CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
391 // CHECK:   [[VABDQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> [[VABDQ_V_I]], <4 x float> [[VABDQ_V1_I]]) #4
392 // CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <4 x float> [[VABDQ_V2_I]] to <16 x i8>
393 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x float>
394 // CHECK:   ret <4 x float> [[TMP2]]
test_vabdq_f32(float32x4_t a,float32x4_t b)395 float32x4_t test_vabdq_f32(float32x4_t a, float32x4_t b) {
396   return vabdq_f32(a, b);
397 }
398 
399 
400 // CHECK-LABEL: define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
401 // CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
402 // CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
403 // CHECK:   ret <8 x i16> [[VMOVL_I_I]]
test_vabdl_s8(int8x8_t a,int8x8_t b)404 int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
405   return vabdl_s8(a, b);
406 }
407 
408 // CHECK-LABEL: define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
409 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
410 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
411 // CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
412 // CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
413 // CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4
414 // CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
415 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
416 // CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
417 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
418 // CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
419 // CHECK:   ret <4 x i32> [[VMOVL_I_I]]
test_vabdl_s16(int16x4_t a,int16x4_t b)420 int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
421   return vabdl_s16(a, b);
422 }
423 
424 // CHECK-LABEL: define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
425 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
426 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
427 // CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
428 // CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
429 // CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4
430 // CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
431 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
432 // CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
433 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
434 // CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
435 // CHECK:   ret <2 x i64> [[VMOVL_I_I]]
test_vabdl_s32(int32x2_t a,int32x2_t b)436 int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
437   return vabdl_s32(a, b);
438 }
439 
440 // CHECK-LABEL: define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
441 // CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
442 // CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
443 // CHECK:   ret <8 x i16> [[VMOVL_I_I]]
test_vabdl_u8(uint8x8_t a,uint8x8_t b)444 uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
445   return vabdl_u8(a, b);
446 }
447 
448 // CHECK-LABEL: define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
449 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
450 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
451 // CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
452 // CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
453 // CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4
454 // CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
455 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
456 // CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
457 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
458 // CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
459 // CHECK:   ret <4 x i32> [[VMOVL_I_I]]
test_vabdl_u16(uint16x4_t a,uint16x4_t b)460 uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
461   return vabdl_u16(a, b);
462 }
463 
464 // CHECK-LABEL: define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
465 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
466 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
467 // CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
468 // CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
469 // CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4
470 // CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
471 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
472 // CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
473 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
474 // CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
475 // CHECK:   ret <2 x i64> [[VMOVL_I_I]]
test_vabdl_u32(uint32x2_t a,uint32x2_t b)476 uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
477   return vabdl_u32(a, b);
478 }
479 
480 
481 // CHECK-LABEL: define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 {
482 // CHECK:   [[VABS_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a) #4
483 // CHECK:   ret <8 x i8> [[VABS_I]]
test_vabs_s8(int8x8_t a)484 int8x8_t test_vabs_s8(int8x8_t a) {
485   return vabs_s8(a);
486 }
487 
488 // CHECK-LABEL: define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 {
489 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
490 // CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
491 // CHECK:   [[VABS1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> [[VABS_I]]) #4
492 // CHECK:   ret <4 x i16> [[VABS1_I]]
test_vabs_s16(int16x4_t a)493 int16x4_t test_vabs_s16(int16x4_t a) {
494   return vabs_s16(a);
495 }
496 
497 // CHECK-LABEL: define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 {
498 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
499 // CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
500 // CHECK:   [[VABS1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> [[VABS_I]]) #4
501 // CHECK:   ret <2 x i32> [[VABS1_I]]
test_vabs_s32(int32x2_t a)502 int32x2_t test_vabs_s32(int32x2_t a) {
503   return vabs_s32(a);
504 }
505 
506 // CHECK-LABEL: define <2 x float> @test_vabs_f32(<2 x float> %a) #0 {
507 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
508 // CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
509 // CHECK:   [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[VABS_I]]) #4
510 // CHECK:   ret <2 x float> [[VABS1_I]]
test_vabs_f32(float32x2_t a)511 float32x2_t test_vabs_f32(float32x2_t a) {
512   return vabs_f32(a);
513 }
514 
515 // CHECK-LABEL: define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 {
516 // CHECK:   [[VABS_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a) #4
517 // CHECK:   ret <16 x i8> [[VABS_I]]
test_vabsq_s8(int8x16_t a)518 int8x16_t test_vabsq_s8(int8x16_t a) {
519   return vabsq_s8(a);
520 }
521 
522 // CHECK-LABEL: define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 {
523 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
524 // CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
525 // CHECK:   [[VABS1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> [[VABS_I]]) #4
526 // CHECK:   ret <8 x i16> [[VABS1_I]]
test_vabsq_s16(int16x8_t a)527 int16x8_t test_vabsq_s16(int16x8_t a) {
528   return vabsq_s16(a);
529 }
530 
531 // CHECK-LABEL: define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 {
532 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
533 // CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
534 // CHECK:   [[VABS1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> [[VABS_I]]) #4
535 // CHECK:   ret <4 x i32> [[VABS1_I]]
test_vabsq_s32(int32x4_t a)536 int32x4_t test_vabsq_s32(int32x4_t a) {
537   return vabsq_s32(a);
538 }
539 
540 // CHECK-LABEL: define <4 x float> @test_vabsq_f32(<4 x float> %a) #0 {
541 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
542 // CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
543 // CHECK:   [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VABS_I]]) #4
544 // CHECK:   ret <4 x float> [[VABS1_I]]
test_vabsq_f32(float32x4_t a)545 float32x4_t test_vabsq_f32(float32x4_t a) {
546   return vabsq_f32(a);
547 }
548 
549 
550 // CHECK-LABEL: define <8 x i8> @test_vadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
551 // CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, %b
552 // CHECK:   ret <8 x i8> [[ADD_I]]
test_vadd_s8(int8x8_t a,int8x8_t b)553 int8x8_t test_vadd_s8(int8x8_t a, int8x8_t b) {
554   return vadd_s8(a, b);
555 }
556 
557 // CHECK-LABEL: define <4 x i16> @test_vadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
558 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, %b
559 // CHECK:   ret <4 x i16> [[ADD_I]]
test_vadd_s16(int16x4_t a,int16x4_t b)560 int16x4_t test_vadd_s16(int16x4_t a, int16x4_t b) {
561   return vadd_s16(a, b);
562 }
563 
564 // CHECK-LABEL: define <2 x i32> @test_vadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
565 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, %b
566 // CHECK:   ret <2 x i32> [[ADD_I]]
test_vadd_s32(int32x2_t a,int32x2_t b)567 int32x2_t test_vadd_s32(int32x2_t a, int32x2_t b) {
568   return vadd_s32(a, b);
569 }
570 
571 // CHECK-LABEL: define <1 x i64> @test_vadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
572 // CHECK:   [[ADD_I:%.*]] = add <1 x i64> %a, %b
573 // CHECK:   ret <1 x i64> [[ADD_I]]
test_vadd_s64(int64x1_t a,int64x1_t b)574 int64x1_t test_vadd_s64(int64x1_t a, int64x1_t b) {
575   return vadd_s64(a, b);
576 }
577 
578 // CHECK-LABEL: define <2 x float> @test_vadd_f32(<2 x float> %a, <2 x float> %b) #0 {
579 // CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, %b
580 // CHECK:   ret <2 x float> [[ADD_I]]
test_vadd_f32(float32x2_t a,float32x2_t b)581 float32x2_t test_vadd_f32(float32x2_t a, float32x2_t b) {
582   return vadd_f32(a, b);
583 }
584 
585 // CHECK-LABEL: define <8 x i8> @test_vadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
586 // CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, %b
587 // CHECK:   ret <8 x i8> [[ADD_I]]
test_vadd_u8(uint8x8_t a,uint8x8_t b)588 uint8x8_t test_vadd_u8(uint8x8_t a, uint8x8_t b) {
589   return vadd_u8(a, b);
590 }
591 
592 // CHECK-LABEL: define <4 x i16> @test_vadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
593 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, %b
594 // CHECK:   ret <4 x i16> [[ADD_I]]
test_vadd_u16(uint16x4_t a,uint16x4_t b)595 uint16x4_t test_vadd_u16(uint16x4_t a, uint16x4_t b) {
596   return vadd_u16(a, b);
597 }
598 
599 // CHECK-LABEL: define <2 x i32> @test_vadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
600 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, %b
601 // CHECK:   ret <2 x i32> [[ADD_I]]
test_vadd_u32(uint32x2_t a,uint32x2_t b)602 uint32x2_t test_vadd_u32(uint32x2_t a, uint32x2_t b) {
603   return vadd_u32(a, b);
604 }
605 
606 // CHECK-LABEL: define <1 x i64> @test_vadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
607 // CHECK:   [[ADD_I:%.*]] = add <1 x i64> %a, %b
608 // CHECK:   ret <1 x i64> [[ADD_I]]
test_vadd_u64(uint64x1_t a,uint64x1_t b)609 uint64x1_t test_vadd_u64(uint64x1_t a, uint64x1_t b) {
610   return vadd_u64(a, b);
611 }
612 
613 // CHECK-LABEL: define <16 x i8> @test_vaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
614 // CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, %b
615 // CHECK:   ret <16 x i8> [[ADD_I]]
test_vaddq_s8(int8x16_t a,int8x16_t b)616 int8x16_t test_vaddq_s8(int8x16_t a, int8x16_t b) {
617   return vaddq_s8(a, b);
618 }
619 
620 // CHECK-LABEL: define <8 x i16> @test_vaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
621 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, %b
622 // CHECK:   ret <8 x i16> [[ADD_I]]
test_vaddq_s16(int16x8_t a,int16x8_t b)623 int16x8_t test_vaddq_s16(int16x8_t a, int16x8_t b) {
624   return vaddq_s16(a, b);
625 }
626 
627 // CHECK-LABEL: define <4 x i32> @test_vaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
628 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, %b
629 // CHECK:   ret <4 x i32> [[ADD_I]]
test_vaddq_s32(int32x4_t a,int32x4_t b)630 int32x4_t test_vaddq_s32(int32x4_t a, int32x4_t b) {
631   return vaddq_s32(a, b);
632 }
633 
634 // CHECK-LABEL: define <2 x i64> @test_vaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
635 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, %b
636 // CHECK:   ret <2 x i64> [[ADD_I]]
test_vaddq_s64(int64x2_t a,int64x2_t b)637 int64x2_t test_vaddq_s64(int64x2_t a, int64x2_t b) {
638   return vaddq_s64(a, b);
639 }
640 
641 // CHECK-LABEL: define <4 x float> @test_vaddq_f32(<4 x float> %a, <4 x float> %b) #0 {
642 // CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, %b
643 // CHECK:   ret <4 x float> [[ADD_I]]
test_vaddq_f32(float32x4_t a,float32x4_t b)644 float32x4_t test_vaddq_f32(float32x4_t a, float32x4_t b) {
645   return vaddq_f32(a, b);
646 }
647 
648 // CHECK-LABEL: define <16 x i8> @test_vaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
649 // CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, %b
650 // CHECK:   ret <16 x i8> [[ADD_I]]
test_vaddq_u8(uint8x16_t a,uint8x16_t b)651 uint8x16_t test_vaddq_u8(uint8x16_t a, uint8x16_t b) {
652   return vaddq_u8(a, b);
653 }
654 
655 // CHECK-LABEL: define <8 x i16> @test_vaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
656 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, %b
657 // CHECK:   ret <8 x i16> [[ADD_I]]
test_vaddq_u16(uint16x8_t a,uint16x8_t b)658 uint16x8_t test_vaddq_u16(uint16x8_t a, uint16x8_t b) {
659   return vaddq_u16(a, b);
660 }
661 
662 // CHECK-LABEL: define <4 x i32> @test_vaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
663 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, %b
664 // CHECK:   ret <4 x i32> [[ADD_I]]
test_vaddq_u32(uint32x4_t a,uint32x4_t b)665 uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b) {
666   return vaddq_u32(a, b);
667 }
668 
669 // CHECK-LABEL: define <2 x i64> @test_vaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
670 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, %b
671 // CHECK:   ret <2 x i64> [[ADD_I]]
test_vaddq_u64(uint64x2_t a,uint64x2_t b)672 uint64x2_t test_vaddq_u64(uint64x2_t a, uint64x2_t b) {
673   return vaddq_u64(a, b);
674 }
675 
676 
677 // CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
678 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
679 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
680 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
681 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
682 // CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
683 // CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
684 // CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
685 // CHECK:   ret <8 x i8> [[VADDHN2_I]]
test_vaddhn_s16(int16x8_t a,int16x8_t b)686 int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
687   return vaddhn_s16(a, b);
688 }
689 
690 // CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
691 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
692 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
693 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
694 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
695 // CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
696 // CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
697 // CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
698 // CHECK:   ret <4 x i16> [[VADDHN2_I]]
test_vaddhn_s32(int32x4_t a,int32x4_t b)699 int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
700   return vaddhn_s32(a, b);
701 }
702 
703 // CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
704 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
705 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
706 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
707 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
708 // CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
709 // CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
710 // CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
711 // CHECK:   ret <2 x i32> [[VADDHN2_I]]
test_vaddhn_s64(int64x2_t a,int64x2_t b)712 int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
713   return vaddhn_s64(a, b);
714 }
715 
716 // CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
717 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
718 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
719 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
720 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
721 // CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
722 // CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
723 // CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
724 // CHECK:   ret <8 x i8> [[VADDHN2_I]]
test_vaddhn_u16(uint16x8_t a,uint16x8_t b)725 uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
726   return vaddhn_u16(a, b);
727 }
728 
729 // CHECK-LABEL: define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
730 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
731 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
732 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
733 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
734 // CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
735 // CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
736 // CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
737 // CHECK:   ret <4 x i16> [[VADDHN2_I]]
test_vaddhn_u32(uint32x4_t a,uint32x4_t b)738 uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
739   return vaddhn_u32(a, b);
740 }
741 
742 // CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
743 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
744 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
745 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
746 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
747 // CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
748 // CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
749 // CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
750 // CHECK:   ret <2 x i32> [[VADDHN2_I]]
test_vaddhn_u64(uint64x2_t a,uint64x2_t b)751 uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
752   return vaddhn_u64(a, b);
753 }
754 
755 
756 // CHECK-LABEL: define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
757 // CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
758 // CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
759 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
760 // CHECK:   ret <8 x i16> [[ADD_I]]
test_vaddl_s8(int8x8_t a,int8x8_t b)761 int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
762   return vaddl_s8(a, b);
763 }
764 
765 // CHECK-LABEL: define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
766 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
767 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
768 // CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
769 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
770 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
771 // CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
772 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
773 // CHECK:   ret <4 x i32> [[ADD_I]]
test_vaddl_s16(int16x4_t a,int16x4_t b)774 int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
775   return vaddl_s16(a, b);
776 }
777 
778 // CHECK-LABEL: define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
779 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
780 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
781 // CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
782 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
783 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
784 // CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
785 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
786 // CHECK:   ret <2 x i64> [[ADD_I]]
test_vaddl_s32(int32x2_t a,int32x2_t b)787 int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
788   return vaddl_s32(a, b);
789 }
790 
791 // CHECK-LABEL: define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
792 // CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
793 // CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
794 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
795 // CHECK:   ret <8 x i16> [[ADD_I]]
test_vaddl_u8(uint8x8_t a,uint8x8_t b)796 uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
797   return vaddl_u8(a, b);
798 }
799 
800 // CHECK-LABEL: define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
801 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
802 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
803 // CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
804 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
805 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
806 // CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
807 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
808 // CHECK:   ret <4 x i32> [[ADD_I]]
test_vaddl_u16(uint16x4_t a,uint16x4_t b)809 uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
810   return vaddl_u16(a, b);
811 }
812 
813 // CHECK-LABEL: define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
814 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
815 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
816 // CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
817 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
818 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
819 // CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
820 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
821 // CHECK:   ret <2 x i64> [[ADD_I]]
test_vaddl_u32(uint32x2_t a,uint32x2_t b)822 uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
823   return vaddl_u32(a, b);
824 }
825 
826 
827 // CHECK-LABEL: define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) #0 {
828 // CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
829 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
830 // CHECK:   ret <8 x i16> [[ADD_I]]
test_vaddw_s8(int16x8_t a,int8x8_t b)831 int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
832   return vaddw_s8(a, b);
833 }
834 
835 // CHECK-LABEL: define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) #0 {
836 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
837 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
838 // CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
839 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
840 // CHECK:   ret <4 x i32> [[ADD_I]]
test_vaddw_s16(int32x4_t a,int16x4_t b)841 int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
842   return vaddw_s16(a, b);
843 }
844 
845 // CHECK-LABEL: define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) #0 {
846 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
847 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
848 // CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
849 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
850 // CHECK:   ret <2 x i64> [[ADD_I]]
test_vaddw_s32(int64x2_t a,int32x2_t b)851 int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) {
852   return vaddw_s32(a, b);
853 }
854 
855 // CHECK-LABEL: define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) #0 {
856 // CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
857 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
858 // CHECK:   ret <8 x i16> [[ADD_I]]
test_vaddw_u8(uint16x8_t a,uint8x8_t b)859 uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
860   return vaddw_u8(a, b);
861 }
862 
863 // CHECK-LABEL: define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) #0 {
864 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
865 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
866 // CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
867 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
868 // CHECK:   ret <4 x i32> [[ADD_I]]
test_vaddw_u16(uint32x4_t a,uint16x4_t b)869 uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
870   return vaddw_u16(a, b);
871 }
872 
873 // CHECK-LABEL: define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) #0 {
874 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
875 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
876 // CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
877 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
878 // CHECK:   ret <2 x i64> [[ADD_I]]
test_vaddw_u32(uint64x2_t a,uint32x2_t b)879 uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) {
880   return vaddw_u32(a, b);
881 }
882 
883 
884 // CHECK-LABEL: define <8 x i8> @test_vand_s8(<8 x i8> %a, <8 x i8> %b) #0 {
885 // CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
886 // CHECK:   ret <8 x i8> [[AND_I]]
test_vand_s8(int8x8_t a,int8x8_t b)887 int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) {
888   return vand_s8(a, b);
889 }
890 
891 // CHECK-LABEL: define <4 x i16> @test_vand_s16(<4 x i16> %a, <4 x i16> %b) #0 {
892 // CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
893 // CHECK:   ret <4 x i16> [[AND_I]]
test_vand_s16(int16x4_t a,int16x4_t b)894 int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) {
895   return vand_s16(a, b);
896 }
897 
898 // CHECK-LABEL: define <2 x i32> @test_vand_s32(<2 x i32> %a, <2 x i32> %b) #0 {
899 // CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
900 // CHECK:   ret <2 x i32> [[AND_I]]
test_vand_s32(int32x2_t a,int32x2_t b)901 int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) {
902   return vand_s32(a, b);
903 }
904 
905 // CHECK-LABEL: define <1 x i64> @test_vand_s64(<1 x i64> %a, <1 x i64> %b) #0 {
906 // CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
907 // CHECK:   ret <1 x i64> [[AND_I]]
test_vand_s64(int64x1_t a,int64x1_t b)908 int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) {
909   return vand_s64(a, b);
910 }
911 
912 // CHECK-LABEL: define <8 x i8> @test_vand_u8(<8 x i8> %a, <8 x i8> %b) #0 {
913 // CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
914 // CHECK:   ret <8 x i8> [[AND_I]]
test_vand_u8(uint8x8_t a,uint8x8_t b)915 uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) {
916   return vand_u8(a, b);
917 }
918 
919 // CHECK-LABEL: define <4 x i16> @test_vand_u16(<4 x i16> %a, <4 x i16> %b) #0 {
920 // CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
921 // CHECK:   ret <4 x i16> [[AND_I]]
test_vand_u16(uint16x4_t a,uint16x4_t b)922 uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) {
923   return vand_u16(a, b);
924 }
925 
926 // CHECK-LABEL: define <2 x i32> @test_vand_u32(<2 x i32> %a, <2 x i32> %b) #0 {
927 // CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
928 // CHECK:   ret <2 x i32> [[AND_I]]
test_vand_u32(uint32x2_t a,uint32x2_t b)929 uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) {
930   return vand_u32(a, b);
931 }
932 
933 // CHECK-LABEL: define <1 x i64> @test_vand_u64(<1 x i64> %a, <1 x i64> %b) #0 {
934 // CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
935 // CHECK:   ret <1 x i64> [[AND_I]]
test_vand_u64(uint64x1_t a,uint64x1_t b)936 uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) {
937   return vand_u64(a, b);
938 }
939 
940 // CHECK-LABEL: define <16 x i8> @test_vandq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
941 // CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
942 // CHECK:   ret <16 x i8> [[AND_I]]
test_vandq_s8(int8x16_t a,int8x16_t b)943 int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) {
944   return vandq_s8(a, b);
945 }
946 
947 // CHECK-LABEL: define <8 x i16> @test_vandq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
948 // CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
949 // CHECK:   ret <8 x i16> [[AND_I]]
test_vandq_s16(int16x8_t a,int16x8_t b)950 int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) {
951   return vandq_s16(a, b);
952 }
953 
954 // CHECK-LABEL: define <4 x i32> @test_vandq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
955 // CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
956 // CHECK:   ret <4 x i32> [[AND_I]]
test_vandq_s32(int32x4_t a,int32x4_t b)957 int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) {
958   return vandq_s32(a, b);
959 }
960 
961 // CHECK-LABEL: define <2 x i64> @test_vandq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
962 // CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
963 // CHECK:   ret <2 x i64> [[AND_I]]
test_vandq_s64(int64x2_t a,int64x2_t b)964 int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) {
965   return vandq_s64(a, b);
966 }
967 
968 // CHECK-LABEL: define <16 x i8> @test_vandq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
969 // CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
970 // CHECK:   ret <16 x i8> [[AND_I]]
test_vandq_u8(uint8x16_t a,uint8x16_t b)971 uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) {
972   return vandq_u8(a, b);
973 }
974 
975 // CHECK-LABEL: define <8 x i16> @test_vandq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
976 // CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
977 // CHECK:   ret <8 x i16> [[AND_I]]
test_vandq_u16(uint16x8_t a,uint16x8_t b)978 uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) {
979   return vandq_u16(a, b);
980 }
981 
982 // CHECK-LABEL: define <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
983 // CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
984 // CHECK:   ret <4 x i32> [[AND_I]]
test_vandq_u32(uint32x4_t a,uint32x4_t b)985 uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) {
986   return vandq_u32(a, b);
987 }
988 
989 // CHECK-LABEL: define <2 x i64> @test_vandq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
990 // CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
991 // CHECK:   ret <2 x i64> [[AND_I]]
test_vandq_u64(uint64x2_t a,uint64x2_t b)992 uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) {
993   return vandq_u64(a, b);
994 }
995 
996 
// CHECK-LABEL: define <8 x i8> @test_vbic_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
// CHECK:   ret <8 x i8> [[AND_I]]
int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) {
  return vbic_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vbic_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
// CHECK:   ret <4 x i16> [[AND_I]]
int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) {
  return vbic_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vbic_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
// CHECK:   ret <2 x i32> [[AND_I]]
int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) {
  return vbic_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vbic_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
// CHECK:   ret <1 x i64> [[AND_I]]
int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) {
  return vbic_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vbic_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
// CHECK:   ret <8 x i8> [[AND_I]]
uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) {
  return vbic_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vbic_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
// CHECK:   ret <4 x i16> [[AND_I]]
uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) {
  return vbic_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vbic_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
// CHECK:   ret <2 x i32> [[AND_I]]
uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) {
  return vbic_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vbic_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
// CHECK:   ret <1 x i64> [[AND_I]]
uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) {
  return vbic_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
// CHECK:   ret <16 x i8> [[AND_I]]
int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) {
  return vbicq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
// CHECK:   ret <8 x i16> [[AND_I]]
int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) {
  return vbicq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
// CHECK:   ret <4 x i32> [[AND_I]]
int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) {
  return vbicq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vbicq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
// CHECK:   ret <2 x i64> [[AND_I]]
int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) {
  return vbicq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vbicq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
// CHECK:   ret <16 x i8> [[AND_I]]
uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) {
  return vbicq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vbicq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
// CHECK:   ret <8 x i16> [[AND_I]]
uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) {
  return vbicq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
// CHECK:   ret <4 x i32> [[AND_I]]
uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) {
  return vbicq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vbicq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
// CHECK:   ret <2 x i64> [[AND_I]]
uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) {
  return vbicq_u64(a, b);
}

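// vbsl (bitwise select) picks bits from b where the corresponding mask bit
// of a is set and from c where it is clear: (a & b) | (~a & c). Clang lowers
// every element width through the byte-level @llvm.arm.neon.vbsl intrinsic,
// which is why the non-i8 variants below bitcast their operands to bytes
// first and bitcast the result back.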
// CHECK-LABEL: define <8 x i8> @test_vbsl_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VBSL_V_I]]
int8x8_t test_vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) {
  return vbsl_s8(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vbsl_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP3]]
int16x4_t test_vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c) {
  return vbsl_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vbsl_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP3]]
int32x2_t test_vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c) {
  return vbsl_s32(a, b, c);
}

// CHECK-LABEL: define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP3]]
int64x1_t test_vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) {
  return vbsl_s64(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vbsl_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VBSL_V_I]]
uint8x8_t test_vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vbsl_u8(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vbsl_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP3]]
uint16x4_t test_vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vbsl_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vbsl_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP3]]
uint32x2_t test_vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vbsl_u32(a, b, c);
}

// CHECK-LABEL: define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP3]]
uint64x1_t test_vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c) {
  return vbsl_u64(a, b, c);
}

// CHECK-LABEL: define <2 x float> @test_vbsl_f32(<2 x i32> %a, <2 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP3]]
float32x2_t test_vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) {
  return vbsl_f32(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vbsl_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VBSL_V_I]]
poly8x8_t test_vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c) {
  return vbsl_p8(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vbsl_p16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP3]]
poly16x4_t test_vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c) {
  return vbsl_p16(a, b, c);
}

// CHECK-LABEL: define <16 x i8> @test_vbslq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4
// CHECK:   ret <16 x i8> [[VBSLQ_V_I]]
int8x16_t test_vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) {
  return vbslq_s8(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vbslq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP3]]
int16x8_t test_vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) {
  return vbslq_s16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vbslq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP3]]
int32x4_t test_vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) {
  return vbslq_s32(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP3]]
int64x2_t test_vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) {
  return vbslq_s64(a, b, c);
}

// CHECK-LABEL: define <16 x i8> @test_vbslq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4
// CHECK:   ret <16 x i8> [[VBSLQ_V_I]]
uint8x16_t test_vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
  return vbslq_u8(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vbslq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP3]]
uint16x8_t test_vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
  return vbslq_u16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vbslq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP3]]
uint32x4_t test_vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
  return vbslq_u32(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP3]]
uint64x2_t test_vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
  return vbslq_u64(a, b, c);
}

// CHECK-LABEL: define <4 x float> @test_vbslq_f32(<4 x i32> %a, <4 x float> %b, <4 x float> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x float>
// CHECK:   ret <4 x float> [[TMP3]]
float32x4_t test_vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) {
  return vbslq_f32(a, b, c);
}

// CHECK-LABEL: define <16 x i8> @test_vbslq_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4
// CHECK:   ret <16 x i8> [[VBSLQ_V_I]]
poly8x16_t test_vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c) {
  return vbslq_p8(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vbslq_p16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP3]]
poly16x8_t test_vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c) {
  return vbslq_p16(a, b, c);
}

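// vcage is an absolute compare: each lane is all-ones if |a| >= |b| and
// all-zeros otherwise, lowered to the @llvm.arm.neon.vacge intrinsic.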
// CHECK-LABEL: define <2 x i32> @test_vcage_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[VCAGE_V_I]], <2 x float> [[VCAGE_V1_I]]) #4
// CHECK:   ret <2 x i32> [[VCAGE_V2_I]]
uint32x2_t test_vcage_f32(float32x2_t a, float32x2_t b) {
  return vcage_f32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcageq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[VCAGEQ_V_I]], <4 x float> [[VCAGEQ_V1_I]]) #4
// CHECK:   ret <4 x i32> [[VCAGEQ_V2_I]]
uint32x4_t test_vcageq_f32(float32x4_t a, float32x4_t b) {
  return vcageq_f32(a, b);
}

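// vcagt is the strict variant, |a| > |b|, lowered to @llvm.arm.neon.vacgt.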
// CHECK-LABEL: define <2 x i32> @test_vcagt_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[VCAGT_V_I]], <2 x float> [[VCAGT_V1_I]]) #4
// CHECK:   ret <2 x i32> [[VCAGT_V2_I]]
uint32x2_t test_vcagt_f32(float32x2_t a, float32x2_t b) {
  return vcagt_f32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcagtq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[VCAGTQ_V_I]], <4 x float> [[VCAGTQ_V1_I]]) #4
// CHECK:   ret <4 x i32> [[VCAGTQ_V2_I]]
uint32x4_t test_vcagtq_f32(float32x4_t a, float32x4_t b) {
  return vcagtq_f32(a, b);
}

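// vcale (|a| <= |b|) has no dedicated intrinsic; it reuses vacge with the
// operands swapped, which is why the checks below feed %b ([[TMP1]]) in as
// the first argument.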
// CHECK-LABEL: define <2 x i32> @test_vcale_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[VCALE_V_I]], <2 x float> [[VCALE_V1_I]]) #4
// CHECK:   ret <2 x i32> [[VCALE_V2_I]]
uint32x2_t test_vcale_f32(float32x2_t a, float32x2_t b) {
  return vcale_f32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcaleq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[VCALEQ_V_I]], <4 x float> [[VCALEQ_V1_I]]) #4
// CHECK:   ret <4 x i32> [[VCALEQ_V2_I]]
uint32x4_t test_vcaleq_f32(float32x4_t a, float32x4_t b) {
  return vcaleq_f32(a, b);
}

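// Likewise, vcalt (|a| < |b|) is emitted as vacgt with swapped operands.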
// CHECK-LABEL: define <2 x i32> @test_vcalt_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[VCALT_V_I]], <2 x float> [[VCALT_V1_I]]) #4
// CHECK:   ret <2 x i32> [[VCALT_V2_I]]
uint32x2_t test_vcalt_f32(float32x2_t a, float32x2_t b) {
  return vcalt_f32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcaltq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[VCALTQ_V_I]], <4 x float> [[VCALTQ_V1_I]]) #4
// CHECK:   ret <4 x i32> [[VCALTQ_V2_I]]
uint32x4_t test_vcaltq_f32(float32x4_t a, float32x4_t b) {
  return vcaltq_f32(a, b);
}

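// The plain compares need no intrinsic at all: vceq becomes an icmp eq (or
// fcmp oeq for floats) followed by a sign extension of the <N x i1> result,
// producing the usual all-ones/all-zeros NEON mask.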
// CHECK-LABEL: define <8 x i8> @test_vceq_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vceq_s8(int8x8_t a, int8x8_t b) {
  return vceq_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vceq_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vceq_s16(int16x4_t a, int16x4_t b) {
  return vceq_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vceq_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vceq_s32(int32x2_t a, int32x2_t b) {
  return vceq_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vceq_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp oeq <2 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vceq_f32(float32x2_t a, float32x2_t b) {
  return vceq_f32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vceq_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vceq_u8(uint8x8_t a, uint8x8_t b) {
  return vceq_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vceq_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vceq_u16(uint16x4_t a, uint16x4_t b) {
  return vceq_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vceq_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vceq_u32(uint32x2_t a, uint32x2_t b) {
  return vceq_u32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vceq_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vceq_p8(poly8x8_t a, poly8x8_t b) {
  return vceq_p8(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vceqq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vceqq_s8(int8x16_t a, int8x16_t b) {
  return vceqq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vceqq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vceqq_s16(int16x8_t a, int16x8_t b) {
  return vceqq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vceqq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vceqq_s32(int32x4_t a, int32x4_t b) {
  return vceqq_s32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vceqq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp oeq <4 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vceqq_f32(float32x4_t a, float32x4_t b) {
  return vceqq_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vceqq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vceqq_u8(uint8x16_t a, uint8x16_t b) {
  return vceqq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vceqq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vceqq_u16(uint16x8_t a, uint16x8_t b) {
  return vceqq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vceqq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vceqq_u32(uint32x4_t a, uint32x4_t b) {
  return vceqq_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vceqq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vceqq_p8(poly8x16_t a, poly8x16_t b) {
  return vceqq_p8(a, b);
}

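// vcge follows the same pattern, with the signedness of the compare chosen
// from the element type: icmp sge for signed vectors, icmp uge for unsigned,
// fcmp oge for float.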
// CHECK-LABEL: define <8 x i8> @test_vcge_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcge_s8(int8x8_t a, int8x8_t b) {
  return vcge_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vcge_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcge_s16(int16x4_t a, int16x4_t b) {
  return vcge_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcge_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcge_s32(int32x2_t a, int32x2_t b) {
  return vcge_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcge_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp oge <2 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcge_f32(float32x2_t a, float32x2_t b) {
  return vcge_f32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vcge_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcge_u8(uint8x8_t a, uint8x8_t b) {
  return vcge_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vcge_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcge_u16(uint16x4_t a, uint16x4_t b) {
  return vcge_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcge_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcge_u32(uint32x2_t a, uint32x2_t b) {
  return vcge_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcgeq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcgeq_s8(int8x16_t a, int8x16_t b) {
  return vcgeq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcgeq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcgeq_s16(int16x8_t a, int16x8_t b) {
  return vcgeq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcgeq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgeq_s32(int32x4_t a, int32x4_t b) {
  return vcgeq_s32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcgeq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp oge <4 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgeq_f32(float32x4_t a, float32x4_t b) {
  return vcgeq_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcgeq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcgeq_u8(uint8x16_t a, uint8x16_t b) {
  return vcgeq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcgeq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcgeq_u16(uint16x8_t a, uint16x8_t b) {
  return vcgeq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcgeq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgeq_u32(uint32x4_t a, uint32x4_t b) {
  return vcgeq_u32(a, b);
}

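// vcgt: strict greater-than via icmp sgt/ugt or fcmp ogt, then sext.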
// CHECK-LABEL: define <8 x i8> @test_vcgt_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcgt_s8(int8x8_t a, int8x8_t b) {
  return vcgt_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vcgt_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcgt_s16(int16x4_t a, int16x4_t b) {
  return vcgt_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcgt_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcgt_s32(int32x2_t a, int32x2_t b) {
  return vcgt_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcgt_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp ogt <2 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcgt_f32(float32x2_t a, float32x2_t b) {
  return vcgt_f32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vcgt_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcgt_u8(uint8x8_t a, uint8x8_t b) {
  return vcgt_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vcgt_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcgt_u16(uint16x4_t a, uint16x4_t b) {
  return vcgt_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcgt_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcgt_u32(uint32x2_t a, uint32x2_t b) {
  return vcgt_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcgtq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcgtq_s8(int8x16_t a, int8x16_t b) {
  return vcgtq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcgtq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcgtq_s16(int16x8_t a, int16x8_t b) {
  return vcgtq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcgtq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgtq_s32(int32x4_t a, int32x4_t b) {
  return vcgtq_s32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcgtq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp ogt <4 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgtq_f32(float32x4_t a, float32x4_t b) {
  return vcgtq_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcgtq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcgtq_u8(uint8x16_t a, uint8x16_t b) {
  return vcgtq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcgtq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcgtq_u16(uint16x8_t a, uint16x8_t b) {
  return vcgtq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcgtq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgtq_u32(uint32x4_t a, uint32x4_t b) {
  return vcgtq_u32(a, b);
}

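// vcle: less-or-equal via icmp sle/ule or fcmp ole, then sext.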
// CHECK-LABEL: define <8 x i8> @test_vcle_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcle_s8(int8x8_t a, int8x8_t b) {
  return vcle_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vcle_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcle_s16(int16x4_t a, int16x4_t b) {
  return vcle_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcle_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcle_s32(int32x2_t a, int32x2_t b) {
  return vcle_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcle_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp ole <2 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcle_f32(float32x2_t a, float32x2_t b) {
  return vcle_f32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vcle_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcle_u8(uint8x8_t a, uint8x8_t b) {
  return vcle_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vcle_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcle_u16(uint16x4_t a, uint16x4_t b) {
  return vcle_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcle_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcle_u32(uint32x2_t a, uint32x2_t b) {
  return vcle_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcleq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcleq_s8(int8x16_t a, int8x16_t b) {
  return vcleq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcleq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcleq_s16(int16x8_t a, int16x8_t b) {
  return vcleq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcleq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcleq_s32(int32x4_t a, int32x4_t b) {
  return vcleq_s32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcleq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp ole <4 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcleq_f32(float32x4_t a, float32x4_t b) {
  return vcleq_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcleq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcleq_u8(uint8x16_t a, uint8x16_t b) {
  return vcleq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcleq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcleq_u16(uint16x8_t a, uint16x8_t b) {
  return vcleq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcleq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcleq_u32(uint32x4_t a, uint32x4_t b) {
  return vcleq_u32(a, b);
}

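// vcls counts leading sign bits: the number of consecutive bits following
// the sign bit that match it. It exists only for signed element types and
// lowers to @llvm.arm.neon.vcls.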
// CHECK-LABEL: define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 {
// CHECK:   [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a) #4
// CHECK:   ret <8 x i8> [[VCLS_V_I]]
int8x8_t test_vcls_s8(int8x8_t a) {
  return vcls_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> [[VCLS_V_I]]) #4
// CHECK:   [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP1]]
int16x4_t test_vcls_s16(int16x4_t a) {
  return vcls_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> [[VCLS_V_I]]) #4
// CHECK:   [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP1]]
int32x2_t test_vcls_s32(int32x2_t a) {
  return vcls_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 {
// CHECK:   [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a) #4
// CHECK:   ret <16 x i8> [[VCLSQ_V_I]]
int8x16_t test_vclsq_s8(int8x16_t a) {
  return vclsq_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> [[VCLSQ_V_I]]) #4
// CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP1]]
int16x8_t test_vclsq_s16(int16x8_t a) {
  return vclsq_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> [[VCLSQ_V_I]]) #4
// CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP1]]
int32x4_t test_vclsq_s32(int32x4_t a) {
  return vclsq_s32(a);
}

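// vclt: less-than via icmp slt/ult or fcmp olt, then sext.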
1964 // CHECK-LABEL: define <8 x i8> @test_vclt_s8(<8 x i8> %a, <8 x i8> %b) #0 {
1965 // CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i8> %a, %b
1966 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
1967 // CHECK:   ret <8 x i8> [[SEXT_I]]
test_vclt_s8(int8x8_t a,int8x8_t b)1968 uint8x8_t test_vclt_s8(int8x8_t a, int8x8_t b) {
1969   return vclt_s8(a, b);
1970 }
1971 
1972 // CHECK-LABEL: define <4 x i16> @test_vclt_s16(<4 x i16> %a, <4 x i16> %b) #0 {
1973 // CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i16> %a, %b
1974 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
1975 // CHECK:   ret <4 x i16> [[SEXT_I]]
test_vclt_s16(int16x4_t a,int16x4_t b)1976 uint16x4_t test_vclt_s16(int16x4_t a, int16x4_t b) {
1977   return vclt_s16(a, b);
1978 }
1979 
// CHECK-LABEL: define <2 x i32> @test_vclt_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vclt_s32(int32x2_t a, int32x2_t b) {
  return vclt_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vclt_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp olt <2 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vclt_f32(float32x2_t a, float32x2_t b) {
  return vclt_f32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vclt_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vclt_u8(uint8x8_t a, uint8x8_t b) {
  return vclt_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vclt_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vclt_u16(uint16x4_t a, uint16x4_t b) {
  return vclt_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vclt_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vclt_u32(uint32x2_t a, uint32x2_t b) {
  return vclt_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcltq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcltq_s8(int8x16_t a, int8x16_t b) {
  return vcltq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcltq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcltq_s16(int16x8_t a, int16x8_t b) {
  return vcltq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcltq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcltq_s32(int32x4_t a, int32x4_t b) {
  return vcltq_s32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcltq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp olt <4 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcltq_f32(float32x4_t a, float32x4_t b) {
  return vcltq_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcltq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcltq_u8(uint8x16_t a, uint8x16_t b) {
  return vcltq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcltq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcltq_u16(uint16x8_t a, uint16x8_t b) {
  return vcltq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcltq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcltq_u32(uint32x4_t a, uint32x4_t b) {
  return vcltq_u32(a, b);
}

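// Illustrative sketch (an added example, not part of the checked output):
// vclt yields an all-ones/all-zeros lane mask, so it composes directly with
// vbsl to select the smaller lane of each pair. The helper name is
// hypothetical.
static int32x2_t example_vmin_via_vclt(int32x2_t a, int32x2_t b) {
  uint32x2_t lt = vclt_s32(a, b); // 0xffffffff where a < b, 0x0 elsewhere
  return vbsl_s32(lt, a, b);      // per-lane select: a where lt is set, else b
}
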
// CHECK-LABEL: define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 {
// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
int8x8_t test_vclz_s8(int8x8_t a) {
  return vclz_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP1]]
int16x4_t test_vclz_s16(int16x4_t a) {
  return vclz_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP1]]
int32x2_t test_vclz_s32(int32x2_t a) {
  return vclz_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vclz_u8(<8 x i8> %a) #0 {
// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
uint8x8_t test_vclz_u8(uint8x8_t a) {
  return vclz_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vclz_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP1]]
uint16x4_t test_vclz_u16(uint16x4_t a) {
  return vclz_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vclz_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP1]]
uint32x2_t test_vclz_u32(uint32x2_t a) {
  return vclz_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 {
// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
int8x16_t test_vclzq_s8(int8x16_t a) {
  return vclzq_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #4
// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP1]]
int16x8_t test_vclzq_s16(int16x8_t a) {
  return vclzq_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #4
// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP1]]
int32x4_t test_vclzq_s32(int32x4_t a) {
  return vclzq_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vclzq_u8(<16 x i8> %a) #0 {
// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
uint8x16_t test_vclzq_u8(uint8x16_t a) {
  return vclzq_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vclzq_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #4
// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP1]]
uint16x8_t test_vclzq_u16(uint16x8_t a) {
  return vclzq_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vclzq_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #4
// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP1]]
uint32x4_t test_vclzq_u32(uint32x4_t a) {
  return vclzq_u32(a);
}

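// Illustrative sketch (an added example, not part of the checked output):
// vclz lowers to @llvm.ctlz with is_zero_undef == false, so zero lanes
// produce the full bit width; floor(log2(x)) per lane is then 31 - clz(x).
// The helper name is hypothetical.
static uint32x2_t example_ilog2_u32(uint32x2_t a) {
  uint32x2_t clz = vclz_u32(a);         // 32 for zero lanes
  return vsub_u32(vdup_n_u32(31), clz); // 31 - clz(x) for nonzero lanes
}
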
// CHECK-LABEL: define <8 x i8> @test_vcnt_u8(<8 x i8> %a) #0 {
// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
// CHECK:   ret <8 x i8> [[VCNT_V_I]]
uint8x8_t test_vcnt_u8(uint8x8_t a) {
  return vcnt_u8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 {
// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
// CHECK:   ret <8 x i8> [[VCNT_V_I]]
int8x8_t test_vcnt_s8(int8x8_t a) {
  return vcnt_s8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vcnt_p8(<8 x i8> %a) #0 {
// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
// CHECK:   ret <8 x i8> [[VCNT_V_I]]
poly8x8_t test_vcnt_p8(poly8x8_t a) {
  return vcnt_p8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vcntq_u8(<16 x i8> %a) #0 {
// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
uint8x16_t test_vcntq_u8(uint8x16_t a) {
  return vcntq_u8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 {
// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
int8x16_t test_vcntq_s8(int8x16_t a) {
  return vcntq_s8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vcntq_p8(<16 x i8> %a) #0 {
// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
poly8x16_t test_vcntq_p8(poly8x16_t a) {
  return vcntq_p8(a);
}

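// Illustrative sketch (an added example, not part of the checked output):
// vcnt only exists at byte granularity (@llvm.ctpop.v8i8); a 16-bit per-lane
// popcount is typically built by a widening pairwise add of the byte counts.
// The helper name is hypothetical.
static uint16x4_t example_popcount_u16(uint16x4_t a) {
  uint8x8_t bytes = vcnt_u8(vreinterpret_u8_u16(a)); // popcount of each byte
  return vpaddl_u8(bytes);                           // pair bytes into u16 lane sums
}
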
// CHECK-LABEL: define <16 x i8> @test_vcombine_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vcombine_s8(int8x8_t a, int8x8_t b) {
  return vcombine_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcombine_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vcombine_s16(int16x4_t a, int16x4_t b) {
  return vcombine_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcombine_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vcombine_s32(int32x2_t a, int32x2_t b) {
  return vcombine_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vcombine_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
int64x2_t test_vcombine_s64(int64x1_t a, int64x1_t b) {
  return vcombine_s64(a, b);
}

// CHECK-LABEL: define <8 x half> @test_vcombine_f16(<4 x half> %a, <4 x half> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x half> [[SHUFFLE_I]]
float16x8_t test_vcombine_f16(float16x4_t a, float16x4_t b) {
  return vcombine_f16(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vcombine_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x float> [[SHUFFLE_I]]
float32x4_t test_vcombine_f32(float32x2_t a, float32x2_t b) {
  return vcombine_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcombine_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vcombine_u8(uint8x8_t a, uint8x8_t b) {
  return vcombine_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcombine_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vcombine_u16(uint16x4_t a, uint16x4_t b) {
  return vcombine_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcombine_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vcombine_u32(uint32x2_t a, uint32x2_t b) {
  return vcombine_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vcombine_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
uint64x2_t test_vcombine_u64(uint64x1_t a, uint64x1_t b) {
  return vcombine_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcombine_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vcombine_p8(poly8x8_t a, poly8x8_t b) {
  return vcombine_p8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcombine_p16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vcombine_p16(poly16x4_t a, poly16x4_t b) {
  return vcombine_p16(a, b);
}

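// Illustrative sketch (an added example, not part of the checked output):
// vcombine is pure register bookkeeping, a shufflevector concatenating two
// d-registers into one q-register, so pairing it with vget_low/vget_high
// moves no data. The helper name is hypothetical.
static int16x8_t example_swap_halves_s16(int16x8_t v) {
  return vcombine_s16(vget_high_s16(v), vget_low_s16(v));
}
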
// CHECK-LABEL: define <8 x i8> @test_vcreate_s8(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4
// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
int8x8_t test_vcreate_s8(uint64_t a) {
  return vclz_s8(vcreate_s8(a));
}

// CHECK-LABEL: define <4 x i16> @test_vcreate_s16(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vcreate_s16(uint64_t a) {
  return vclz_s16(vcreate_s16(a));
}

// CHECK-LABEL: define <2 x i32> @test_vcreate_s32(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vcreate_s32(uint64_t a) {
  return vclz_s32(vcreate_s32(a));
}

// CHECK-LABEL: define <4 x half> @test_vcreate_f16(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vcreate_f16(uint64_t a) {
  return vcreate_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vcreate_f32(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vcreate_f32(uint64_t a) {
  return vcreate_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vcreate_u8(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4
// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
uint8x8_t test_vcreate_u8(uint64_t a) {
  return vclz_u8(vcreate_u8(a));
}

// CHECK-LABEL: define <4 x i16> @test_vcreate_u16(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vcreate_u16(uint64_t a) {
  return vclz_u16(vcreate_u16(a));
}

// CHECK-LABEL: define <2 x i32> @test_vcreate_u32(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vcreate_u32(uint64_t a) {
  return vclz_u32(vcreate_u32(a));
}

// There are two ways of lowering this: either with one 'vmov d, r, r' or
// with two 'vmov d[], r'. LLVM does the latter. We may want to be less
// strict about the matching pattern if it starts causing problems.
// CHECK-LABEL: define <1 x i64> @test_vcreate_u64(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
// CHECK:   ret <1 x i64> [[ADD_I]]
uint64x1_t test_vcreate_u64(uint64_t a) {
  uint64x1_t tmp = vcreate_u64(a);
  return vadd_u64(tmp, tmp);
}

// CHECK-LABEL: define <8 x i8> @test_vcreate_p8(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]]) #4
// CHECK:   ret <8 x i8> [[VCNT_V_I]]
poly8x8_t test_vcreate_p8(uint64_t a) {
  return vcnt_p8(vcreate_p8(a));
}

// CHECK-LABEL: define <4 x i16> @test_vcreate_p16(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]]) #4
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP4]]
poly16x4_t test_vcreate_p16(uint64_t a) {
  poly16x4_t tmp = vcreate_p16(a);
  return vbsl_p16(tmp, tmp, tmp);
}

// CHECK-LABEL: define <1 x i64> @test_vcreate_s64(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
// CHECK:   ret <1 x i64> [[ADD_I]]
int64x1_t test_vcreate_s64(uint64_t a) {
  int64x1_t tmp = vcreate_s64(a);
  return vadd_s64(tmp, tmp);
}

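// Illustrative sketch (an added example, not part of the checked output):
// vcreate is a plain bitcast of a 64-bit scalar into a d-register, so on this
// little-endian target bits 0..7 of the scalar land in lane 0 of a u8 vector.
// The helper name is hypothetical.
static uint8x8_t example_bytes_from_u64(void) {
  return vcreate_u8(0x0706050403020100ULL); // lane i holds the value i
}
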
// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]]) #4
// CHECK:   [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
// CHECK:   ret <4 x half> [[TMP1]]
float16x4_t test_vcvt_f16_f32(float32x4_t a) {
  return vcvt_f16_f32(a);
}


// CHECK-LABEL: define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCVT_I:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
// CHECK:   ret <2 x float> [[VCVT_I]]
float32x2_t test_vcvt_f32_s32(int32x2_t a) {
  return vcvt_f32_s32(a);
}

// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCVT_I:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x float>
// CHECK:   ret <2 x float> [[VCVT_I]]
float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
  return vcvt_f32_u32(a);
}

// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCVT_I:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
// CHECK:   ret <4 x float> [[VCVT_I]]
float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
  return vcvtq_f32_s32(a);
}

// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCVT_I:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
// CHECK:   ret <4 x float> [[VCVT_I]]
float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
  return vcvtq_f32_u32(a);
}


// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK:   [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) #4
// CHECK:   [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x float>
// CHECK:   ret <4 x float> [[TMP1]]
float32x4_t test_vcvt_f32_f16(float16x4_t a) {
  return vcvt_f32_f16(a);
}


// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
// CHECK:   ret <2 x float> [[VCVT_N1]]
float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
  return vcvt_n_f32_s32(a, 1);
}

// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
// CHECK:   ret <2 x float> [[VCVT_N1]]
float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
  return vcvt_n_f32_u32(a, 1);
}

// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
// CHECK:   ret <4 x float> [[VCVT_N1]]
float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
  return vcvtq_n_f32_s32(a, 3);
}

// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
// CHECK:   ret <4 x float> [[VCVT_N1]]
float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
  return vcvtq_n_f32_u32(a, 3);
}


// CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
// CHECK:   ret <2 x i32> [[VCVT_N1]]
int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
  return vcvt_n_s32_f32(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
// CHECK:   ret <4 x i32> [[VCVT_N1]]
int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
  return vcvtq_n_s32_f32(a, 3);
}


// CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
// CHECK:   ret <2 x i32> [[VCVT_N1]]
uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
  return vcvt_n_u32_f32(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
// CHECK:   ret <4 x i32> [[VCVT_N1]]
uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
  return vcvtq_n_u32_f32(a, 3);
}


// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCVT_I:%.*]] = fptosi <2 x float> [[TMP1]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VCVT_I]]
int32x2_t test_vcvt_s32_f32(float32x2_t a) {
  return vcvt_s32_f32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCVT_I:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VCVT_I]]
int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
  return vcvtq_s32_f32(a);
}


// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCVT_I:%.*]] = fptoui <2 x float> [[TMP1]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VCVT_I]]
uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
  return vcvt_u32_f32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCVT_I:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VCVT_I]]
uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
  return vcvtq_u32_f32(a);
}

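// Illustrative sketch (an added example, not part of the checked output):
// the _n_ conversions treat the integer as fixed point with n fractional
// bits, scaling by 2^-n toward float and by 2^n back. The helper name is
// hypothetical.
static float32x2_t example_q16_16_to_float(int32x2_t fixed) {
  return vcvt_n_f32_s32(fixed, 16); // interpret each lane as Q16.16
}
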
// CHECK-LABEL: define <8 x i8> @test_vdup_lane_u8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE]]
uint8x8_t test_vdup_lane_u8(uint8x8_t a) {
  return vdup_lane_u8(a, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_lane_u16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE]]
uint16x4_t test_vdup_lane_u16(uint16x4_t a) {
  return vdup_lane_u16(a, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vdup_lane_u32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
// CHECK:   ret <2 x i32> [[SHUFFLE]]
uint32x2_t test_vdup_lane_u32(uint32x2_t a) {
  return vdup_lane_u32(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vdup_lane_s8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE]]
int8x8_t test_vdup_lane_s8(int8x8_t a) {
  return vdup_lane_s8(a, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_lane_s16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE]]
int16x4_t test_vdup_lane_s16(int16x4_t a) {
  return vdup_lane_s16(a, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vdup_lane_s32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
// CHECK:   ret <2 x i32> [[SHUFFLE]]
int32x2_t test_vdup_lane_s32(int32x2_t a) {
  return vdup_lane_s32(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vdup_lane_p8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE]]
poly8x8_t test_vdup_lane_p8(poly8x8_t a) {
  return vdup_lane_p8(a, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_lane_p16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE]]
poly16x4_t test_vdup_lane_p16(poly16x4_t a) {
  return vdup_lane_p16(a, 3);
}

// CHECK-LABEL: define <2 x float> @test_vdup_lane_f32(<2 x float> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 1>
// CHECK:   ret <2 x float> [[SHUFFLE]]
float32x2_t test_vdup_lane_f32(float32x2_t a) {
  return vdup_lane_f32(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_u8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <16 x i8> [[SHUFFLE]]
uint8x16_t test_vdupq_lane_u8(uint8x8_t a) {
  return vdupq_lane_u8(a, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_u16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <8 x i16> [[SHUFFLE]]
uint16x8_t test_vdupq_lane_u16(uint16x4_t a) {
  return vdupq_lane_u16(a, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_u32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   ret <4 x i32> [[SHUFFLE]]
uint32x4_t test_vdupq_lane_u32(uint32x2_t a) {
  return vdupq_lane_u32(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <16 x i8> [[SHUFFLE]]
int8x16_t test_vdupq_lane_s8(int8x8_t a) {
  return vdupq_lane_s8(a, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <8 x i16> [[SHUFFLE]]
int16x8_t test_vdupq_lane_s16(int16x4_t a) {
  return vdupq_lane_s16(a, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   ret <4 x i32> [[SHUFFLE]]
int32x4_t test_vdupq_lane_s32(int32x2_t a) {
  return vdupq_lane_s32(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_p8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <16 x i8> [[SHUFFLE]]
poly8x16_t test_vdupq_lane_p8(poly8x8_t a) {
  return vdupq_lane_p8(a, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_p16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <8 x i16> [[SHUFFLE]]
poly16x8_t test_vdupq_lane_p16(poly16x4_t a) {
  return vdupq_lane_p16(a, 3);
}

// CHECK-LABEL: define <4 x float> @test_vdupq_lane_f32(<2 x float> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   ret <4 x float> [[SHUFFLE]]
float32x4_t test_vdupq_lane_f32(float32x2_t a) {
  return vdupq_lane_f32(a, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vdup_lane_s64(<1 x i64> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[SHUFFLE]]
int64x1_t test_vdup_lane_s64(int64x1_t a) {
  return vdup_lane_s64(a, 0);
}

// CHECK-LABEL: define <1 x i64> @test_vdup_lane_u64(<1 x i64> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[SHUFFLE]]
uint64x1_t test_vdup_lane_u64(uint64x1_t a) {
  return vdup_lane_u64(a, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer
// CHECK:   ret <2 x i64> [[SHUFFLE]]
int64x2_t test_vdupq_lane_s64(int64x1_t a) {
  return vdupq_lane_s64(a, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_u64(<1 x i64> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer
// CHECK:   ret <2 x i64> [[SHUFFLE]]
uint64x2_t test_vdupq_lane_u64(uint64x1_t a) {
  return vdupq_lane_u64(a, 0);
}

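// Illustrative sketch (an added example, not part of the checked output):
// the lane index must be a compile-time constant in range, which is why each
// vdup_lane lowers to a single shufflevector with a constant splat mask.
// The helper name is hypothetical.
static float32x4_t example_broadcast_lane0(float32x2_t v) {
  return vdupq_lane_f32(v, 0); // splat d-register lane 0 across a q-register
}
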
// CHECK-LABEL: define <8 x i8> @test_vdup_n_u8(i8 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VECINIT7_I]]
uint8x8_t test_vdup_n_u8(uint8_t a) {
  return vdup_n_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_n_u16(i16 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VECINIT3_I]]
uint16x4_t test_vdup_n_u16(uint16_t a) {
  return vdup_n_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vdup_n_u32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   ret <2 x i32> [[VECINIT1_I]]
uint32x2_t test_vdup_n_u32(uint32_t a) {
  return vdup_n_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vdup_n_s8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VECINIT7_I]]
int8x8_t test_vdup_n_s8(int8_t a) {
  return vdup_n_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_n_s16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VECINIT3_I]]
int16x4_t test_vdup_n_s16(int16_t a) {
  return vdup_n_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vdup_n_s32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   ret <2 x i32> [[VECINIT1_I]]
int32x2_t test_vdup_n_s32(int32_t a) {
  return vdup_n_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vdup_n_p8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VECINIT7_I]]
poly8x8_t test_vdup_n_p8(poly8_t a) {
  return vdup_n_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_n_p16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VECINIT3_I]]
poly16x4_t test_vdup_n_p16(poly16_t a) {
  return vdup_n_p16(a);
}

// CHECK-LABEL: define <4 x half> @test_vdup_n_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
// CHECK:   [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
// CHECK:   [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
// CHECK:   [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
// CHECK:   [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
// CHECK:   ret <4 x half> [[VECINIT3]]
float16x4_t test_vdup_n_f16(float16_t *a) {
  return vdup_n_f16(*a);
}

// CHECK-LABEL: define <2 x float> @test_vdup_n_f32(float %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
// CHECK:   ret <2 x float> [[VECINIT1_I]]
float32x2_t test_vdup_n_f32(float32_t a) {
  return vdup_n_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vdupq_n_u8(i8 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VECINIT15_I]]
uint8x16_t test_vdupq_n_u8(uint8_t a) {
  return vdupq_n_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_n_u16(i16 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VECINIT7_I]]
uint16x8_t test_vdupq_n_u16(uint16_t a) {
  return vdupq_n_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vdupq_n_u32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
// CHECK:   ret <4 x i32> [[VECINIT3_I]]
uint32x4_t test_vdupq_n_u32(uint32_t a) {
  return vdupq_n_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vdupq_n_s8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VECINIT15_I]]
int8x16_t test_vdupq_n_s8(int8_t a) {
  return vdupq_n_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_n_s16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VECINIT7_I]]
int16x8_t test_vdupq_n_s16(int16_t a) {
  return vdupq_n_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vdupq_n_s32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
// CHECK:   ret <4 x i32> [[VECINIT3_I]]
int32x4_t test_vdupq_n_s32(int32_t a) {
  return vdupq_n_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vdupq_n_p8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VECINIT15_I]]
poly8x16_t test_vdupq_n_p8(poly8_t a) {
  return vdupq_n_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_n_p16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VECINIT7_I]]
poly16x8_t test_vdupq_n_p16(poly16_t a) {
  return vdupq_n_p16(a);
}

// CHECK-LABEL: define <8 x half> @test_vdupq_n_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
// CHECK:   [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
// CHECK:   [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
// CHECK:   [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
// CHECK:   [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
// CHECK:   [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
// CHECK:   [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
// CHECK:   [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
// CHECK:   [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
// CHECK:   ret <8 x half> [[VECINIT7]]
float16x8_t test_vdupq_n_f16(float16_t *a) {
  return vdupq_n_f16(*a);
}

// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(float %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
// CHECK:   ret <4 x float> [[VECINIT3_I]]
float32x4_t test_vdupq_n_f32(float32_t a) {
  return vdupq_n_f32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vdup_n_s64(i64 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
// CHECK:   ret <1 x i64> [[ADD_I]]
int64x1_t test_vdup_n_s64(int64_t a) {
  int64x1_t tmp = vdup_n_s64(a);
  return vadd_s64(tmp, tmp);
}

// CHECK-LABEL: define <1 x i64> @test_vdup_n_u64(i64 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
// CHECK:   ret <1 x i64> [[ADD_I]]
uint64x1_t test_vdup_n_u64(uint64_t a) {
  uint64x1_t tmp = vdup_n_u64(a);
  return vadd_u64(tmp, tmp);
}

// CHECK-LABEL: define <2 x i64> @test_vdupq_n_s64(i64 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vdupq_n_s64(int64_t a) {
  int64x2_t tmp = vdupq_n_s64(a);
  return vaddq_s64(tmp, tmp);
}

// CHECK-LABEL: define <2 x i64> @test_vdupq_n_u64(i64 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vdupq_n_u64(uint64_t a) {
  uint64x2_t tmp = vdupq_n_u64(a);
  return vaddq_u64(tmp, tmp);
}
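
// A minimal usage sketch (not part of the FileCheck-verified tests): the
// vdup_n_*/vdupq_n_* intrinsics broadcast one scalar into every lane, which
// is why the IR above is a chain of insertelement instructions. The helper
// name below is hypothetical and purely illustrative; being static inline
// and unreferenced, it should not change the IR that FileCheck inspects.
static inline int32x2_t example_vdup_add_bias(int32x2_t v) {
  int32x2_t bias = vdup_n_s32(42); // <42, 42>
  return vadd_s32(v, bias);        // per-lane v + 42
}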


// CHECK-LABEL: define <8 x i8> @test_veor_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[XOR_I]]
int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) {
  return veor_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_veor_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[XOR_I]]
int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) {
  return veor_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_veor_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[XOR_I]]
int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) {
  return veor_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_veor_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[XOR_I]]
int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) {
  return veor_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_veor_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[XOR_I]]
uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) {
  return veor_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_veor_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[XOR_I]]
uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) {
  return veor_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_veor_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[XOR_I]]
uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) {
  return veor_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_veor_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[XOR_I]]
uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) {
  return veor_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_veorq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[XOR_I]]
int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) {
  return veorq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[XOR_I]]
int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) {
  return veorq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_veorq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[XOR_I]]
int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) {
  return veorq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_veorq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[XOR_I]]
int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) {
  return veorq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_veorq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[XOR_I]]
uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) {
  return veorq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_veorq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[XOR_I]]
uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) {
  return veorq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[XOR_I]]
uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) {
  return veorq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_veorq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[XOR_I]]
uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) {
  return veorq_u64(a, b);
}
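
// A minimal usage sketch (not part of the FileCheck-verified tests): veor
// lowers to a plain IR xor, so the usual XOR idioms apply directly. The
// hypothetical helper below clears a vector by XOR-ing it with itself.
static inline uint8x8_t example_veor_zero(uint8x8_t v) {
  return veor_u8(v, v); // x ^ x == 0 in every lane
}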


// CHECK-LABEL: define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i8> [[VEXT]]
int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
  return vext_s8(a, b, 7);
}

// CHECK-LABEL: define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i8> [[VEXT]]
uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
  return vext_u8(a, b, 7);
}

// CHECK-LABEL: define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i8> [[VEXT]]
poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
  return vext_p8(a, b, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x i16> [[VEXT]]
int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
  return vext_s16(a, b, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x i16> [[VEXT]]
uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
  return vext_u16(a, b, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x i16> [[VEXT]]
poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
  return vext_p16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK:   ret <2 x i32> [[VEXT]]
int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
  return vext_s32(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK:   ret <2 x i32> [[VEXT]]
uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
  return vext_u32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[VEXT]]
int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
  return vext_s64(a, b, 0);
}

// CHECK-LABEL: define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[VEXT]]
uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
  return vext_u64(a, b, 0);
}

// CHECK-LABEL: define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK:   ret <2 x float> [[VEXT]]
float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
  return vext_f32(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
// CHECK:   ret <16 x i8> [[VEXT]]
int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
  return vextq_s8(a, b, 15);
}

// CHECK-LABEL: define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
// CHECK:   ret <16 x i8> [[VEXT]]
uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
  return vextq_u8(a, b, 15);
}

// CHECK-LABEL: define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
// CHECK:   ret <16 x i8> [[VEXT]]
poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
  return vextq_p8(a, b, 15);
}

// CHECK-LABEL: define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i16> [[VEXT]]
int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
  return vextq_s16(a, b, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i16> [[VEXT]]
uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
  return vextq_u16(a, b, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i16> [[VEXT]]
poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
  return vextq_p16(a, b, 7);
}

// CHECK-LABEL: define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x i32> [[VEXT]]
int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
  return vextq_s32(a, b, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x i32> [[VEXT]]
uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
  return vextq_u32(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK:   ret <2 x i64> [[VEXT]]
int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
  return vextq_s64(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK:   ret <2 x i64> [[VEXT]]
uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
  return vextq_u64(a, b, 1);
}

// CHECK-LABEL: define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x float> [[VEXT]]
float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
  return vextq_f32(a, b, 3);
}
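
// A minimal usage sketch (not part of the FileCheck-verified tests):
// vext_*(a, b, n) conceptually concatenates a and b and extracts a full
// vector starting at lane n, which is why FileCheck sees a shufflevector
// with indices n..n+lanes-1. With a = <a0, a1> and b = <b0, b1>, the
// hypothetical helper below returns <a1, b0>.
static inline uint32x2_t example_vext_window(uint32x2_t a, uint32x2_t b) {
  return vext_u32(a, b, 1);
}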


// CHECK-LABEL: define <2 x float> @test_vfma_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4
// CHECK:   ret <2 x float> [[TMP6]]
float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vfma_f32(a, b, c);
}

// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4
// CHECK:   ret <4 x float> [[TMP6]]
float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vfmaq_f32(a, b, c);
}

// CHECK-LABEL: define <2 x float> @test_vfms_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4
// CHECK:   ret <2 x float> [[TMP6]]
float32x2_t test_vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vfms_f32(a, b, c);
}

// CHECK-LABEL: define <4 x float> @test_vfmsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4
// CHECK:   ret <4 x float> [[TMP6]]
float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vfmsq_f32(a, b, c);
}
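
// A minimal usage sketch (not part of the FileCheck-verified tests): as the
// IR above shows, vfma_f32(a, b, c) maps to llvm.fma(b, c, a), i.e. it
// computes a + b*c with a single rounding, and vfms negates b first to get
// a - b*c. The hypothetical helper below evaluates k0 + k1*x per lane.
static inline float32x2_t example_vfma_poly1(float32x2_t x) {
  float32x2_t k0 = vdup_n_f32(1.0f);
  float32x2_t k1 = vdup_n_f32(0.5f);
  return vfma_f32(k0, k1, x); // k0 + k1*x in each lane
}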


// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vget_high_s8(int8x16_t a) {
  return vget_high_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vget_high_s16(int16x8_t a) {
  return vget_high_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
int32x2_t test_vget_high_s32(int32x4_t a) {
  return vget_high_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(<2 x i64> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
int64x1_t test_vget_high_s64(int64x2_t a) {
  return vget_high_s64(a);
}

// CHECK-LABEL: define <4 x half> @test_vget_high_f16(<8 x half> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <4 x half> [[SHUFFLE_I]]
float16x4_t test_vget_high_f16(float16x8_t a) {
  return vget_high_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vget_high_f32(<4 x float> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   ret <2 x float> [[SHUFFLE_I]]
float32x2_t test_vget_high_f32(float32x4_t a) {
  return vget_high_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vget_high_u8(uint8x16_t a) {
  return vget_high_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vget_high_u16(uint16x8_t a) {
  return vget_high_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vget_high_u32(uint32x4_t a) {
  return vget_high_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(<2 x i64> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
uint64x1_t test_vget_high_u64(uint64x2_t a) {
  return vget_high_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vget_high_p8(poly8x16_t a) {
  return vget_high_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vget_high_p16(poly16x8_t a) {
  return vget_high_p16(a);
}
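
// A minimal usage sketch (not part of the FileCheck-verified tests):
// vget_high_* selects the upper half of a 128-bit vector (the shufflevector
// above picks lanes N/2..N-1); together with vget_low_* (tested further
// below) it splits a q-register into two d-registers. The hypothetical
// helper sums the two halves.
static inline int16x4_t example_fold_halves(int16x8_t v) {
  return vadd_s16(vget_high_s16(v), vget_low_s16(v));
}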


// CHECK-LABEL: define zeroext i8 @test_vget_lane_u8(<8 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
// CHECK:   ret i8 [[VGET_LANE]]
uint8_t test_vget_lane_u8(uint8x8_t a) {
  return vget_lane_u8(a, 7);
}

// CHECK-LABEL: define zeroext i16 @test_vget_lane_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK:   ret i16 [[VGET_LANE]]
uint16_t test_vget_lane_u16(uint16x4_t a) {
  return vget_lane_u16(a, 3);
}

// CHECK-LABEL: define i32 @test_vget_lane_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK:   ret i32 [[VGET_LANE]]
uint32_t test_vget_lane_u32(uint32x2_t a) {
  return vget_lane_u32(a, 1);
}

// CHECK-LABEL: define signext i8 @test_vget_lane_s8(<8 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
// CHECK:   ret i8 [[VGET_LANE]]
int8_t test_vget_lane_s8(int8x8_t a) {
  return vget_lane_s8(a, 7);
}

// CHECK-LABEL: define signext i16 @test_vget_lane_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK:   ret i16 [[VGET_LANE]]
int16_t test_vget_lane_s16(int16x4_t a) {
  return vget_lane_s16(a, 3);
}

// CHECK-LABEL: define i32 @test_vget_lane_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK:   ret i32 [[VGET_LANE]]
int32_t test_vget_lane_s32(int32x2_t a) {
  return vget_lane_s32(a, 1);
}

// CHECK-LABEL: define signext i8 @test_vget_lane_p8(<8 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
// CHECK:   ret i8 [[VGET_LANE]]
poly8_t test_vget_lane_p8(poly8x8_t a) {
  return vget_lane_p8(a, 7);
}

// CHECK-LABEL: define signext i16 @test_vget_lane_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK:   ret i16 [[VGET_LANE]]
poly16_t test_vget_lane_p16(poly16x4_t a) {
  return vget_lane_p16(a, 3);
}

// CHECK-LABEL: define float @test_vget_lane_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK:   ret float [[VGET_LANE]]
float32_t test_vget_lane_f32(float32x2_t a) {
  return vget_lane_f32(a, 1);
}

// CHECK-LABEL: define float @test_vget_lane_f16(<4 x half> %a) #0 {
// CHECK:   [[__REINT_242:%.*]] = alloca <4 x half>, align 8
// CHECK:   [[__REINT1_242:%.*]] = alloca i16, align 2
// CHECK:   store <4 x half> %a, <4 x half>* [[__REINT_242]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_242]] to <4 x i16>*
// CHECK:   [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
// CHECK:   store i16 [[VGET_LANE]], i16* [[__REINT1_242]], align 2
// CHECK:   [[TMP4:%.*]] = bitcast i16* [[__REINT1_242]] to half*
// CHECK:   [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
// CHECK:   [[CONV:%.*]] = fpext half [[TMP5]] to float
// CHECK:   ret float [[CONV]]
float32_t test_vget_lane_f16(float16x4_t a) {
  return vget_lane_f16(a, 1);
}

// CHECK-LABEL: define zeroext i8 @test_vgetq_lane_u8(<16 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
// CHECK:   ret i8 [[VGET_LANE]]
uint8_t test_vgetq_lane_u8(uint8x16_t a) {
  return vgetq_lane_u8(a, 15);
}

// CHECK-LABEL: define zeroext i16 @test_vgetq_lane_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK:   ret i16 [[VGET_LANE]]
uint16_t test_vgetq_lane_u16(uint16x8_t a) {
  return vgetq_lane_u16(a, 7);
}

// CHECK-LABEL: define i32 @test_vgetq_lane_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK:   ret i32 [[VGET_LANE]]
uint32_t test_vgetq_lane_u32(uint32x4_t a) {
  return vgetq_lane_u32(a, 3);
}

// CHECK-LABEL: define signext i8 @test_vgetq_lane_s8(<16 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
// CHECK:   ret i8 [[VGET_LANE]]
int8_t test_vgetq_lane_s8(int8x16_t a) {
  return vgetq_lane_s8(a, 15);
}

// CHECK-LABEL: define signext i16 @test_vgetq_lane_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK:   ret i16 [[VGET_LANE]]
int16_t test_vgetq_lane_s16(int16x8_t a) {
  return vgetq_lane_s16(a, 7);
}

// CHECK-LABEL: define i32 @test_vgetq_lane_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK:   ret i32 [[VGET_LANE]]
int32_t test_vgetq_lane_s32(int32x4_t a) {
  return vgetq_lane_s32(a, 3);
}

// CHECK-LABEL: define signext i8 @test_vgetq_lane_p8(<16 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
// CHECK:   ret i8 [[VGET_LANE]]
poly8_t test_vgetq_lane_p8(poly8x16_t a) {
  return vgetq_lane_p8(a, 15);
}

// CHECK-LABEL: define signext i16 @test_vgetq_lane_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK:   ret i16 [[VGET_LANE]]
poly16_t test_vgetq_lane_p16(poly16x8_t a) {
  return vgetq_lane_p16(a, 7);
}

// CHECK-LABEL: define float @test_vgetq_lane_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
// CHECK:   ret float [[VGET_LANE]]
float32_t test_vgetq_lane_f32(float32x4_t a) {
  return vgetq_lane_f32(a, 3);
}

// CHECK-LABEL: define float @test_vgetq_lane_f16(<8 x half> %a) #0 {
// CHECK:   [[__REINT_244:%.*]] = alloca <8 x half>, align 16
// CHECK:   [[__REINT1_244:%.*]] = alloca i16, align 2
// CHECK:   store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>*
// CHECK:   [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
// CHECK:   store i16 [[VGET_LANE]], i16* [[__REINT1_244]], align 2
// CHECK:   [[TMP4:%.*]] = bitcast i16* [[__REINT1_244]] to half*
// CHECK:   [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
// CHECK:   [[CONV:%.*]] = fpext half [[TMP5]] to float
// CHECK:   ret float [[CONV]]
float32_t test_vgetq_lane_f16(float16x8_t a) {
  return vgetq_lane_f16(a, 3);
}

// The optimizer is able to remove all moves now.
// CHECK-LABEL: define i64 @test_vget_lane_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK:   ret i64 [[VGET_LANE]]
int64_t test_vget_lane_s64(int64x1_t a) {
  return vget_lane_s64(a, 0);
}

// The optimizer is able to remove all moves now.
// CHECK-LABEL: define i64 @test_vget_lane_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK:   ret i64 [[VGET_LANE]]
uint64_t test_vget_lane_u64(uint64x1_t a) {
  return vget_lane_u64(a, 0);
}

// CHECK-LABEL: define i64 @test_vgetq_lane_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK:   ret i64 [[VGET_LANE]]
int64_t test_vgetq_lane_s64(int64x2_t a) {
  return vgetq_lane_s64(a, 1);
}

// CHECK-LABEL: define i64 @test_vgetq_lane_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK:   ret i64 [[VGET_LANE]]
uint64_t test_vgetq_lane_u64(uint64x2_t a) {
  return vgetq_lane_u64(a, 1);
}
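
// A minimal usage sketch (not part of the FileCheck-verified tests):
// vget_lane_*(v, n) extracts lane n as a scalar (a single IR
// extractelement); n must be a constant expression within range. The
// hypothetical helper below sums both lanes of a 2 x i32 vector.
static inline int32_t example_sum_lanes(int32x2_t v) {
  return vget_lane_s32(v, 0) + vget_lane_s32(v, 1);
}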


// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vget_low_s8(int8x16_t a) {
  return vget_low_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vget_low_s16(int16x8_t a) {
  return vget_low_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
int32x2_t test_vget_low_s32(int32x4_t a) {
  return vget_low_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(<2 x i64> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
int64x1_t test_vget_low_s64(int64x2_t a) {
  return vget_low_s64(a);
}

// CHECK-LABEL: define <4 x half> @test_vget_low_f16(<8 x half> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x half> [[SHUFFLE_I]]
float16x4_t test_vget_low_f16(float16x8_t a) {
  return vget_low_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vget_low_f32(<4 x float> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x float> [[SHUFFLE_I]]
float32x2_t test_vget_low_f32(float32x4_t a) {
  return vget_low_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vget_low_u8(uint8x16_t a) {
  return vget_low_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vget_low_u16(uint16x8_t a) {
  return vget_low_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vget_low_u32(uint32x4_t a) {
  return vget_low_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(<2 x i64> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
uint64x1_t test_vget_low_u64(uint64x2_t a) {
  return vget_low_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vget_low_p8(poly8x16_t a) {
  return vget_low_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vget_low_p16(poly16x8_t a) {
  return vget_low_p16(a);
}
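
// A minimal usage sketch (not part of the FileCheck-verified tests):
// vget_low_* mirrors vget_high_*, selecting lanes 0..N/2-1. A common
// pattern is to widen one half at a time, as the hypothetical helper
// below does for the low half of a byte vector.
static inline uint16x8_t example_widen_low(uint8x16_t v) {
  return vmovl_u8(vget_low_u8(v)); // zero-extend 8 lanes of u8 to u16
}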


// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VHADD_V_I]]
int8x8_t test_vhadd_s8(int8x8_t a, int8x8_t b) {
  return vhadd_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4
// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vhadd_s16(int16x4_t a, int16x4_t b) {
  return vhadd_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4
// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vhadd_s32(int32x2_t a, int32x2_t b) {
  return vhadd_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vhadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VHADD_V_I]]
uint8x8_t test_vhadd_u8(uint8x8_t a, uint8x8_t b) {
  return vhadd_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vhadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4
// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vhadd_u16(uint16x4_t a, uint16x4_t b) {
  return vhadd_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vhadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4
// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vhadd_u32(uint32x2_t a, uint32x2_t b) {
  return vhadd_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vhaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
int8x16_t test_vhaddq_s8(int8x16_t a, int8x16_t b) {
  return vhaddq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vhaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4
// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) {
  return vhaddq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4
// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vhaddq_s32(int32x4_t a, int32x4_t b) {
  return vhaddq_s32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vhaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b) {
  return vhaddq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4
// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vhaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vhaddq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4
// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b) {
  return vhaddq_u32(a, b);
}
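
// A minimal usage sketch (not part of the FileCheck-verified tests): vhadd
// is the halving add, (a + b) >> 1 computed in a wider intermediate so the
// sum cannot wrap; it is a natural building block for averaging (vrhadd is
// the rounding variant). The hypothetical helper averages two pixel rows,
// truncating.
static inline uint8x8_t example_average_rows(uint8x8_t r0, uint8x8_t r1) {
  return vhadd_u8(r0, r1); // per-lane (r0 + r1) >> 1, no overflow
}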


// CHECK-LABEL: define <8 x i8> @test_vhsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VHSUB_V_I]]
int8x8_t test_vhsub_s8(int8x8_t a, int8x8_t b) {
  return vhsub_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vhsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vhsub_s16(int16x4_t a, int16x4_t b) {
  return vhsub_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vhsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vhsub_s32(int32x2_t a, int32x2_t b) {
  return vhsub_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vhsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VHSUB_V_I]]
uint8x8_t test_vhsub_u8(uint8x8_t a, uint8x8_t b) {
  return vhsub_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vhsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vhsub_u16(uint16x4_t a, uint16x4_t b) {
  return vhsub_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vhsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vhsub_u32(uint32x2_t a, uint32x2_t b) {
  return vhsub_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vhsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
int8x16_t test_vhsubq_s8(int8x16_t a, int8x16_t b) {
  return vhsubq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) {
  return vhsubq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vhsubq_s32(int32x4_t a, int32x4_t b) {
  return vhsubq_s32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vhsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b) {
  return vhsubq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vhsubq_u16(uint16x8_t a, uint16x8_t b) {
  return vhsubq_u16(a, b);
}

4108 // CHECK-LABEL: define <4 x i32> @test_vhsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
4109 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4110 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
4111 // CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4112 // CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
4113 // CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
4114 // CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
4115 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
4116 // CHECK:   ret <4 x i32> [[TMP2]]
test_vhsubq_u32(uint32x4_t a,uint32x4_t b)4117 uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b) {
4118   return vhsubq_u32(a, b);
4119 }
4120 
4121 
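// vld1/vld1q load a full vector through @llvm.arm.neon.vld1, passing an
// explicit alignment argument derived from the element type; note that i64
// elements keep the 4-byte alignment this target's ABI gives 64-bit types.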
// CHECK-LABEL: define <16 x i8> @test_vld1q_u8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <16 x i8> [[VLD1]]
uint8x16_t test_vld1q_u8(uint8_t const * a) {
  return vld1q_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_u16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <8 x i16> [[VLD1]]
uint16x8_t test_vld1q_u16(uint16_t const * a) {
  return vld1q_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_u32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <4 x i32> [[VLD1]]
uint32x4_t test_vld1q_u32(uint32_t const * a) {
  return vld1q_u32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_u64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <2 x i64> [[VLD1]]
uint64x2_t test_vld1q_u64(uint64_t const * a) {
  return vld1q_u64(a);
}

// CHECK-LABEL: define <16 x i8> @test_vld1q_s8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <16 x i8> [[VLD1]]
int8x16_t test_vld1q_s8(int8_t const * a) {
  return vld1q_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_s16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <8 x i16> [[VLD1]]
int16x8_t test_vld1q_s16(int16_t const * a) {
  return vld1q_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_s32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <4 x i32> [[VLD1]]
int32x4_t test_vld1q_s32(int32_t const * a) {
  return vld1q_s32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_s64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <2 x i64> [[VLD1]]
int64x2_t test_vld1q_s64(int64_t const * a) {
  return vld1q_s64(a);
}

// CHECK-LABEL: define <8 x half> @test_vld1q_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VLD1]] to <8 x half>
// CHECK:   ret <8 x half> [[TMP1]]
float16x8_t test_vld1q_f16(float16_t const * a) {
  return vld1q_f16(a);
}

// CHECK-LABEL: define <4 x float> @test_vld1q_f32(float* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <4 x float> [[VLD1]]
float32x4_t test_vld1q_f32(float32_t const * a) {
  return vld1q_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vld1q_p8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <16 x i8> [[VLD1]]
poly8x16_t test_vld1q_p8(poly8_t const * a) {
  return vld1q_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_p16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <8 x i16> [[VLD1]]
poly16x8_t test_vld1q_p16(poly16_t const * a) {
  return vld1q_p16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_u8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <8 x i8> [[VLD1]]
uint8x8_t test_vld1_u8(uint8_t const * a) {
  return vld1_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_u16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <4 x i16> [[VLD1]]
uint16x4_t test_vld1_u16(uint16_t const * a) {
  return vld1_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vld1_u32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <2 x i32> [[VLD1]]
uint32x2_t test_vld1_u32(uint32_t const * a) {
  return vld1_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_u64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <1 x i64> [[VLD1]]
uint64x1_t test_vld1_u64(uint64_t const * a) {
  return vld1_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_s8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <8 x i8> [[VLD1]]
int8x8_t test_vld1_s8(int8_t const * a) {
  return vld1_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_s16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <4 x i16> [[VLD1]]
int16x4_t test_vld1_s16(int16_t const * a) {
  return vld1_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vld1_s32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <2 x i32> [[VLD1]]
int32x2_t test_vld1_s32(int32_t const * a) {
  return vld1_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_s64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <1 x i64> [[VLD1]]
int64x1_t test_vld1_s64(int64_t const * a) {
  return vld1_s64(a);
}

// CHECK-LABEL: define <4 x half> @test_vld1_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VLD1]] to <4 x half>
// CHECK:   ret <4 x half> [[TMP1]]
float16x4_t test_vld1_f16(float16_t const * a) {
  return vld1_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vld1_f32(float* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <2 x float> @llvm.arm.neon.vld1.v2f32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <2 x float> [[VLD1]]
float32x2_t test_vld1_f32(float32_t const * a) {
  return vld1_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_p8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <8 x i8> [[VLD1]]
poly8x8_t test_vld1_p8(poly8_t const * a) {
  return vld1_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_p16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <4 x i16> [[VLD1]]
poly16x4_t test_vld1_p16(poly16_t const * a) {
  return vld1_p16(a);
}


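// vld1_dup/vld1q_dup load a single element and splat it: the checks expect a
// scalar load, an insertelement into lane 0, and a shufflevector with an
// all-zero mask that broadcasts that lane across the whole vector.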
// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK:   ret <16 x i8> [[LANE]]
uint8x16_t test_vld1q_dup_u8(uint8_t const * a) {
  return vld1q_dup_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   ret <8 x i16> [[LANE]]
uint16x8_t test_vld1q_dup_u16(uint16_t const * a) {
  return vld1q_dup_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   ret <4 x i32> [[LANE]]
uint32x4_t test_vld1q_dup_u32(uint32_t const * a) {
  return vld1q_dup_u32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
// CHECK:   ret <2 x i64> [[LANE]]
uint64x2_t test_vld1q_dup_u64(uint64_t const * a) {
  return vld1q_dup_u64(a);
}

// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK:   ret <16 x i8> [[LANE]]
int8x16_t test_vld1q_dup_s8(int8_t const * a) {
  return vld1q_dup_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   ret <8 x i16> [[LANE]]
int16x8_t test_vld1q_dup_s16(int16_t const * a) {
  return vld1q_dup_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   ret <4 x i32> [[LANE]]
int32x4_t test_vld1q_dup_s32(int32_t const * a) {
  return vld1q_dup_s32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
// CHECK:   ret <2 x i64> [[LANE]]
int64x2_t test_vld1q_dup_s64(int64_t const * a) {
  return vld1q_dup_s64(a);
}

// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
// CHECK:   ret <8 x half> [[TMP4]]
float16x8_t test_vld1q_dup_f16(float16_t const * a) {
  return vld1q_dup_f16(a);
}

// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32(float* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   ret <4 x float> [[LANE]]
float32x4_t test_vld1q_dup_f32(float32_t const * a) {
  return vld1q_dup_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK:   ret <16 x i8> [[LANE]]
poly8x16_t test_vld1q_dup_p8(poly8_t const * a) {
  return vld1q_dup_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   ret <8 x i16> [[LANE]]
poly16x8_t test_vld1q_dup_p16(poly16_t const * a) {
  return vld1q_dup_p16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   ret <8 x i8> [[LANE]]
uint8x8_t test_vld1_dup_u8(uint8_t const * a) {
  return vld1_dup_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   ret <4 x i16> [[LANE]]
uint16x4_t test_vld1_dup_u16(uint16_t const * a) {
  return vld1_dup_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
// CHECK:   ret <2 x i32> [[LANE]]
uint32x2_t test_vld1_dup_u32(uint32_t const * a) {
  return vld1_dup_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[LANE]]
uint64x1_t test_vld1_dup_u64(uint64_t const * a) {
  return vld1_dup_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   ret <8 x i8> [[LANE]]
int8x8_t test_vld1_dup_s8(int8_t const * a) {
  return vld1_dup_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   ret <4 x i16> [[LANE]]
int16x4_t test_vld1_dup_s16(int16_t const * a) {
  return vld1_dup_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
// CHECK:   ret <2 x i32> [[LANE]]
int32x2_t test_vld1_dup_s32(int32_t const * a) {
  return vld1_dup_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[LANE]]
int64x1_t test_vld1_dup_s64(int64_t const * a) {
  return vld1_dup_s64(a);
}

// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
// CHECK:   ret <4 x half> [[TMP4]]
float16x4_t test_vld1_dup_f16(float16_t const * a) {
  return vld1_dup_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32(float* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
// CHECK:   ret <2 x float> [[LANE]]
float32x2_t test_vld1_dup_f32(float32_t const * a) {
  return vld1_dup_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   ret <8 x i8> [[LANE]]
poly8x8_t test_vld1_dup_p8(poly8_t const * a) {
  return vld1_dup_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   ret <4 x i16> [[LANE]]
poly16x4_t test_vld1_dup_p16(poly16_t const * a) {
  return vld1_dup_p16(a);
}


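// vld1_lane/vld1q_lane replace a single lane of an existing vector: the
// checks expect a scalar load feeding an insertelement at the requested lane,
// except the 64-bit Q-register forms, which load through @llvm.arm.neon.vld1
// and merge with a shufflevector instead.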
// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
// CHECK:   ret <16 x i8> [[VLD1_LANE]]
uint8x16_t test_vld1q_lane_u8(uint8_t const * a, uint8x16_t b) {
  return vld1q_lane_u8(a, b, 15);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK:   ret <8 x i16> [[VLD1_LANE]]
uint16x8_t test_vld1q_lane_u16(uint16_t const * a, uint16x8_t b) {
  return vld1q_lane_u16(a, b, 7);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
// CHECK:   ret <4 x i32> [[VLD1_LANE]]
uint32x4_t test_vld1q_lane_u32(uint32_t const * a, uint32x4_t b) {
  return vld1q_lane_u32(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i64> [[VLD1Q_LANE]]
uint64x2_t test_vld1q_lane_u64(uint64_t const * a, uint64x2_t b) {
  return vld1q_lane_u64(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
// CHECK:   ret <16 x i8> [[VLD1_LANE]]
int8x16_t test_vld1q_lane_s8(int8_t const * a, int8x16_t b) {
  return vld1q_lane_s8(a, b, 15);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK:   ret <8 x i16> [[VLD1_LANE]]
int16x8_t test_vld1q_lane_s16(int16_t const * a, int16x8_t b) {
  return vld1q_lane_s16(a, b, 7);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
// CHECK:   ret <4 x i32> [[VLD1_LANE]]
int32x4_t test_vld1q_lane_s32(int32_t const * a, int32x4_t b) {
  return vld1q_lane_s32(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i64> [[VLD1Q_LANE]]
int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) {
  return vld1q_lane_s64(a, b, 1);
}

// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(half* %a, <8 x half> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half>
// CHECK:   ret <8 x half> [[TMP5]]
float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) {
  return vld1q_lane_f16(a, b, 7);
}

// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
// CHECK:   ret <4 x float> [[VLD1_LANE]]
float32x4_t test_vld1q_lane_f32(float32_t const * a, float32x4_t b) {
  return vld1q_lane_f32(a, b, 3);
}

// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
// CHECK:   ret <16 x i8> [[VLD1_LANE]]
poly8x16_t test_vld1q_lane_p8(poly8_t const * a, poly8x16_t b) {
  return vld1q_lane_p8(a, b, 15);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
// CHECK:   ret <8 x i16> [[VLD1_LANE]]
poly16x8_t test_vld1q_lane_p16(poly16_t const * a, poly16x8_t b) {
  return vld1q_lane_p16(a, b, 7);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8(i8* %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
// CHECK:   ret <8 x i8> [[VLD1_LANE]]
uint8x8_t test_vld1_lane_u8(uint8_t const * a, uint8x8_t b) {
  return vld1_lane_u8(a, b, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(i16* %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK:   ret <4 x i16> [[VLD1_LANE]]
uint16x4_t test_vld1_lane_u16(uint16_t const * a, uint16x4_t b) {
  return vld1_lane_u16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(i32* %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
// CHECK:   ret <2 x i32> [[VLD1_LANE]]
uint32x2_t test_vld1_lane_u32(uint32_t const * a, uint32x2_t b) {
  return vld1_lane_u32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(i64* %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 4
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
// CHECK:   ret <1 x i64> [[VLD1_LANE]]
uint64x1_t test_vld1_lane_u64(uint64_t const * a, uint64x1_t b) {
  return vld1_lane_u64(a, b, 0);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
// CHECK:   ret <8 x i8> [[VLD1_LANE]]
int8x8_t test_vld1_lane_s8(int8_t const * a, int8x8_t b) {
  return vld1_lane_s8(a, b, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK:   ret <4 x i16> [[VLD1_LANE]]
int16x4_t test_vld1_lane_s16(int16_t const * a, int16x4_t b) {
  return vld1_lane_s16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
// CHECK:   ret <2 x i32> [[VLD1_LANE]]
int32x2_t test_vld1_lane_s32(int32_t const * a, int32x2_t b) {
  return vld1_lane_s32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 4
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
// CHECK:   ret <1 x i64> [[VLD1_LANE]]
int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) {
  return vld1_lane_s64(a, b, 0);
}

// CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(half* %a, <4 x half> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[VLD1_LANE]] to <4 x half>
// CHECK:   ret <4 x half> [[TMP5]]
float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) {
  return vld1_lane_f16(a, b, 3);
}

// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
// CHECK:   ret <2 x float> [[VLD1_LANE]]
float32x2_t test_vld1_lane_f32(float32_t const * a, float32x2_t b) {
  return vld1_lane_f32(a, b, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8(i8* %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
// CHECK:   ret <8 x i8> [[VLD1_LANE]]
poly8x8_t test_vld1_lane_p8(poly8_t const * a, poly8x8_t b) {
  return vld1_lane_p8(a, b, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(i16* %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
// CHECK:   ret <4 x i16> [[VLD1_LANE]]
poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {
  return vld1_lane_p16(a, b, 3);
}


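// The vld2 family returns a structure of two vectors, so these functions are
// lowered with an sret out-pointer: the { <..>, <..> } aggregate produced by
// @llvm.arm.neon.vld2 is stored to a local and then memcpy'd into the result.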
// CHECK-LABEL: define void @test_vld2q_u8(%struct.uint8x16x2_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 16, i1 false)
// CHECK:   ret void
uint8x16x2_t test_vld2q_u8(uint8_t const * a) {
  return vld2q_u8(a);
}

// CHECK-LABEL: define void @test_vld2q_u16(%struct.uint16x8x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
// CHECK:   ret void
uint16x8x2_t test_vld2q_u16(uint16_t const * a) {
  return vld2q_u16(a);
}

// CHECK-LABEL: define void @test_vld2q_u32(%struct.uint32x4x2_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2Q_V]], { <4 x i32>, <4 x i32> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
// CHECK:   ret void
uint32x4x2_t test_vld2q_u32(uint32_t const * a) {
  return vld2q_u32(a);
}

// CHECK-LABEL: define void @test_vld2q_s8(%struct.int8x16x2_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 16, i1 false)
// CHECK:   ret void
int8x16x2_t test_vld2q_s8(int8_t const * a) {
  return vld2q_s8(a);
}

// CHECK-LABEL: define void @test_vld2q_s16(%struct.int16x8x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
// CHECK:   ret void
int16x8x2_t test_vld2q_s16(int16_t const * a) {
  return vld2q_s16(a);
}

// CHECK-LABEL: define void @test_vld2q_s32(%struct.int32x4x2_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2Q_V]], { <4 x i32>, <4 x i32> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
// CHECK:   ret void
int32x4x2_t test_vld2q_s32(int32_t const * a) {
  return vld2q_s32(a);
}

// CHECK-LABEL: define void @test_vld2q_f16(%struct.float16x8x2_t* noalias sret %agg.result, half* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
// CHECK:   ret void
float16x8x2_t test_vld2q_f16(float16_t const * a) {
  return vld2q_f16(a);
}

// CHECK-LABEL: define void @test_vld2q_f32(%struct.float32x4x2_t* noalias sret %agg.result, float* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
// CHECK:   store { <4 x float>, <4 x float> } [[VLD2Q_V]], { <4 x float>, <4 x float> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
// CHECK:   ret void
float32x4x2_t test_vld2q_f32(float32_t const * a) {
  return vld2q_f32(a);
}

// CHECK-LABEL: define void @test_vld2q_p8(%struct.poly8x16x2_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 16, i1 false)
// CHECK:   ret void
poly8x16x2_t test_vld2q_p8(poly8_t const * a) {
  return vld2q_p8(a);
}

// CHECK-LABEL: define void @test_vld2q_p16(%struct.poly16x8x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
// CHECK:   ret void
poly16x8x2_t test_vld2q_p16(poly16_t const * a) {
  return vld2q_p16(a);
}

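// The 64-bit D-register vld2 variants below follow the same sret pattern, but
// the two-vector structs are 16 bytes with 8-byte alignment.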
4980 // CHECK-LABEL: define void @test_vld2_u8(%struct.uint8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
4981 // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
4982 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
4983 // CHECK:   [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1)
4984 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
4985 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_V]], { <8 x i8>, <8 x i8> }* [[TMP1]]
4986 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
4987 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
4988 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 16, i32 8, i1 false)
4989 // CHECK:   ret void
test_vld2_u8(uint8_t const * a)4990 uint8x8x2_t test_vld2_u8(uint8_t const * a) {
4991   return vld2_u8(a);
4992 }
4993 
// CHECK-LABEL: define void @test_vld2_u16(%struct.uint16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
uint16x4x2_t test_vld2_u16(uint16_t const * a) {
  return vld2_u16(a);
}

// CHECK-LABEL: define void @test_vld2_u32(%struct.uint32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_V]], { <2 x i32>, <2 x i32> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
uint32x2x2_t test_vld2_u32(uint32_t const * a) {
  return vld2_u32(a);
}

// CHECK-LABEL: define void @test_vld2_u64(%struct.uint64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_V]], { <1 x i64>, <1 x i64> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
uint64x1x2_t test_vld2_u64(uint64_t const * a) {
  return vld2_u64(a);
}

// CHECK-LABEL: define void @test_vld2_s8(%struct.int8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_V]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int8x8x2_t test_vld2_s8(int8_t const * a) {
  return vld2_s8(a);
}

// CHECK-LABEL: define void @test_vld2_s16(%struct.int16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int16x4x2_t test_vld2_s16(int16_t const * a) {
  return vld2_s16(a);
}

// CHECK-LABEL: define void @test_vld2_s32(%struct.int32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_V]], { <2 x i32>, <2 x i32> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int32x2x2_t test_vld2_s32(int32_t const * a) {
  return vld2_s32(a);
}

// CHECK-LABEL: define void @test_vld2_s64(%struct.int64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_V]], { <1 x i64>, <1 x i64> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int64x1x2_t test_vld2_s64(int64_t const * a) {
  return vld2_s64(a);
}

// CHECK-LABEL: define void @test_vld2_f16(%struct.float16x4x2_t* noalias sret %agg.result, half* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
float16x4x2_t test_vld2_f16(float16_t const * a) {
  return vld2_f16(a);
}

// CHECK-LABEL: define void @test_vld2_f32(%struct.float32x2x2_t* noalias sret %agg.result, float* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
// CHECK:   store { <2 x float>, <2 x float> } [[VLD2_V]], { <2 x float>, <2 x float> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
float32x2x2_t test_vld2_f32(float32_t const * a) {
  return vld2_f32(a);
}

// CHECK-LABEL: define void @test_vld2_p8(%struct.poly8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_V]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 16, i32 8, i1 false)
// CHECK:   ret void
poly8x8x2_t test_vld2_p8(poly8_t const * a) {
  return vld2_p8(a);
}

// CHECK-LABEL: define void @test_vld2_p16(%struct.poly16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
poly16x4x2_t test_vld2_p16(poly16_t const * a) {
  return vld2_p16(a);
}


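// Note: the vld2_dup tests expect the duplicating load to be emitted as an
// @llvm.arm.neon.vld2lane call with lane 0 and undef vector operands, with
// each returned vector then splatted across all lanes by a shufflevector
// with a zeroinitializer mask. The 64-bit variants use a plain
// @llvm.arm.neon.vld2 instead, since a one-element vector needs no splat.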
// CHECK-LABEL: define void @test_vld2_dup_u8(%struct.uint8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP2]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[TMP4]], { <8 x i8>, <8 x i8> }* [[TMP5]]
// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP6]], i8* [[TMP7]], i32 16, i32 8, i1 false)
// CHECK:   ret void
uint8x8x2_t test_vld2_dup_u8(uint8_t const * a) {
  return vld2_dup_u8(a);
}

// CHECK-LABEL: define void @test_vld2_dup_u16(%struct.uint16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
// CHECK:   ret void
uint16x4x2_t test_vld2_dup_u16(uint16_t const * a) {
  return vld2_dup_u16(a);
}

// CHECK-LABEL: define void @test_vld2_dup_u32(%struct.uint32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32> } [[TMP5]], { <2 x i32>, <2 x i32> }* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
// CHECK:   ret void
uint32x2x2_t test_vld2_dup_u32(uint32_t const * a) {
  return vld2_dup_u32(a);
}

// CHECK-LABEL: define void @test_vld2_dup_u64(%struct.uint64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
uint64x1x2_t test_vld2_dup_u64(uint64_t const * a) {
  return vld2_dup_u64(a);
}

// CHECK-LABEL: define void @test_vld2_dup_s8(%struct.int8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP2]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[TMP4]], { <8 x i8>, <8 x i8> }* [[TMP5]]
// CHECK:   [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP6]], i8* [[TMP7]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int8x8x2_t test_vld2_dup_s8(int8_t const * a) {
  return vld2_dup_s8(a);
}

// CHECK-LABEL: define void @test_vld2_dup_s16(%struct.int16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int16x4x2_t test_vld2_dup_s16(int16_t const * a) {
  return vld2_dup_s16(a);
}

// CHECK-LABEL: define void @test_vld2_dup_s32(%struct.int32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32> } [[TMP5]], { <2 x i32>, <2 x i32> }* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int32x2x2_t test_vld2_dup_s32(int32_t const * a) {
  return vld2_dup_s32(a);
}

// CHECK-LABEL: define void @test_vld2_dup_s64(%struct.int64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int64x1x2_t test_vld2_dup_s64(int64_t const * a) {
  return vld2_dup_s64(a);
}

// CHECK-LABEL: define void @test_vld2_dup_f16(%struct.float16x4x2_t* noalias sret %agg.result, half* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.float16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
// CHECK:   ret void
float16x4x2_t test_vld2_dup_f16(float16_t const * a) {
  return vld2_dup_f16(a);
}

// CHECK-LABEL: define void @test_vld2_dup_f32(%struct.float32x2x2_t* noalias sret %agg.result, float* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* [[TMP1]], <2 x float> undef, <2 x float> undef, i32 0, i32 4)
// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x float>, <2 x float> } [[VLD_DUP]], <2 x float> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP3]], <2 x float> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
// CHECK:   store { <2 x float>, <2 x float> } [[TMP5]], { <2 x float>, <2 x float> }* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
// CHECK:   ret void
float32x2x2_t test_vld2_dup_f32(float32_t const * a) {
  return vld2_dup_f32(a);
}

// CHECK-LABEL: define void @test_vld2_dup_p8(%struct.poly8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP2]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[TMP4]], { <8 x i8>, <8 x i8> }* [[TMP5]]
// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP6]], i8* [[TMP7]], i32 16, i32 8, i1 false)
// CHECK:   ret void
poly8x8x2_t test_vld2_dup_p8(poly8_t const * a) {
  return vld2_dup_p8(a);
}

// CHECK-LABEL: define void @test_vld2_dup_p16(%struct.poly16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
// CHECK:   ret void
poly16x4x2_t test_vld2_dup_p16(poly16_t const * a) {
  return vld2_dup_p16(a);
}


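// Note: the vld2q_lane tests pass the existing q-register pair coerced to
// [4 x i64] %b.coerce, so each check first rebuilds the struct from the
// coerced argument, reloads both vectors, and then expects an
// @llvm.arm.neon.vld2lane call carrying the highest valid lane index.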
// CHECK-LABEL: define void @test_vld2q_lane_u16(%struct.uint16x8x2_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
// CHECK:   ret void
uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
  return vld2q_lane_u16(a, b, 7);
}

// CHECK-LABEL: define void @test_vld2q_lane_u32(%struct.uint32x4x2_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 3, i32 4)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], { <4 x i32>, <4 x i32> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
// CHECK:   ret void
uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
  return vld2q_lane_u32(a, b, 3);
}

// CHECK-LABEL: define void @test_vld2q_lane_s16(%struct.int16x8x2_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
// CHECK:   ret void
int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
  return vld2q_lane_s16(a, b, 7);
}

// CHECK-LABEL: define void @test_vld2q_lane_s32(%struct.int32x4x2_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 3, i32 4)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], { <4 x i32>, <4 x i32> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
// CHECK:   ret void
int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
  return vld2q_lane_s32(a, b, 3);
}

// CHECK-LABEL: define void @test_vld2q_lane_f16(%struct.float16x8x2_t* noalias sret %agg.result, half* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
// CHECK:   ret void
float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
  return vld2q_lane_f16(a, b, 7);
}

// CHECK-LABEL: define void @test_vld2q_lane_f32(%struct.float32x4x2_t* noalias sret %agg.result, float* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32.p0i8(i8* [[TMP4]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], i32 3, i32 4)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x float>, <4 x float> }*
// CHECK:   store { <4 x float>, <4 x float> } [[VLD2Q_LANE_V]], { <4 x float>, <4 x float> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
// CHECK:   ret void
float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
  return vld2q_lane_f32(a, b, 3);
}

// CHECK-LABEL: define void @test_vld2q_lane_p16(%struct.poly16x8x2_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
// CHECK:   ret void
poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) {
  return vld2q_lane_p16(a, b, 7);
}

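// Note: the d-register vld2_lane variants below follow the same pattern with
// a [2 x i64] %b.coerce argument and a 16-byte memcpy of the result.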
5627 // CHECK-LABEL: define void @test_vld2_lane_u8(%struct.uint8x8x2_t* noalias sret %agg.result, i8* %a, [2 x i64] %b.coerce) #0 {
5628 // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
5629 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
5630 // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
5631 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
5632 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
5633 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5634 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
5635 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
5636 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
5637 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
5638 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
5639 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
5640 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
5641 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
5642 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
5643 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
5644 // CHECK:   [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
5645 // CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8> }*
5646 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], { <8 x i8>, <8 x i8> }* [[TMP6]]
5647 // CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
5648 // CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
5649 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
5650 // CHECK:   ret void
test_vld2_lane_u8(uint8_t const * a,uint8x8x2_t b)5651 uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
5652   return vld2_lane_u8(a, b, 7);
5653 }
5654 
5655 // CHECK-LABEL: define void @test_vld2_lane_u16(%struct.uint16x4x2_t* noalias sret %agg.result, i16* %a, [2 x i64] %b.coerce) #0 {
5656 // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
5657 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
5658 // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
5659 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
5660 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
5661 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5662 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
5663 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
5664 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
5665 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
5666 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
5667 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
5668 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
5669 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
5670 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
5671 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
5672 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
5673 // CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
5674 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
5675 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5676 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
5677 // CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
5678 // CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
5679 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
5680 // CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
5681 // CHECK:   [[TMP13:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
5682 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
5683 // CHECK:   ret void
test_vld2_lane_u16(uint16_t const * a,uint16x4x2_t b)5684 uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
5685   return vld2_lane_u16(a, b, 3);
5686 }
5687 
5688 // CHECK-LABEL: define void @test_vld2_lane_u32(%struct.uint32x2x2_t* noalias sret %agg.result, i32* %a, [2 x i64] %b.coerce) #0 {
5689 // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
5690 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
5691 // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
5692 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
5693 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
5694 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5695 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
5696 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
5697 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
5698 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
5699 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
5700 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
5701 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
5702 // CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
5703 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
5704 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
5705 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
5706 // CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
5707 // CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
5708 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
5709 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
5710 // CHECK:   [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], i32 1, i32 4)
5711 // CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32> }*
5712 // CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], { <2 x i32>, <2 x i32> }* [[TMP11]]
5713 // CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
5714 // CHECK:   [[TMP13:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
5715 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
5716 // CHECK:   ret void
test_vld2_lane_u32(uint32_t const * a,uint32x2x2_t b)5717 uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
5718   return vld2_lane_u32(a, b, 1);
5719 }
5720 
// CHECK-LABEL: define void @test_vld2_lane_s8(%struct.int8x8x2_t* noalias sret %agg.result, i8* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], { <8 x i8>, <8 x i8> }* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
  return vld2_lane_s8(a, b, 7);
}

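// NOTE (annotation): for 8-bit elements the pointer argument already has
// type i8* and the vectors are already <8 x i8>, so the s8/u8/p8 variants
// need none of the pointer or vector bitcasts seen in the wider-element
// checks; %a feeds the intrinsic directly.
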
// CHECK-LABEL: define void @test_vld2_lane_s16(%struct.int16x4x2_t* noalias sret %agg.result, i16* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
  return vld2_lane_s16(a, b, 3);
}

// CHECK-LABEL: define void @test_vld2_lane_s32(%struct.int32x2x2_t* noalias sret %agg.result, i32* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK:   [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], i32 1, i32 4)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], { <2 x i32>, <2 x i32> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
// CHECK:   ret void
int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
  return vld2_lane_s32(a, b, 1);
}

// CHECK-LABEL: define void @test_vld2_lane_f16(%struct.float16x4x2_t* noalias sret %agg.result, half* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
// CHECK:   ret void
float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
  return vld2_lane_f16(a, b, 3);
}

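// NOTE (annotation): there is no native half-precision vector arithmetic
// in this IR, so the f16 variant is checked against the same v4i16 form
// of the intrinsic as the 16-bit integer tests; only the <4 x half>
// loads and the bitcasts into <8 x i8> differ from the s16/u16 checks.
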
// CHECK-LABEL: define void @test_vld2_lane_f32(%struct.float32x2x2_t* noalias sret %agg.result, float* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
// CHECK:   [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* [[TMP4]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], i32 1, i32 4)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <2 x float>, <2 x float> }*
// CHECK:   store { <2 x float>, <2 x float> } [[VLD2_LANE_V]], { <2 x float>, <2 x float> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
// CHECK:   ret void
float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
  return vld2_lane_f32(a, b, 1);
}

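// NOTE (annotation): unlike f16, the f32 variant keeps its floating-point
// element type all the way into the intrinsic (the v2f32 form); the
// scalar address is still passed as an opaque i8*, as in every other
// variant in this family.
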
// CHECK-LABEL: define void @test_vld2_lane_p8(%struct.poly8x8x2_t* noalias sret %agg.result, i8* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], { <8 x i8>, <8 x i8> }* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
// CHECK:   ret void
poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
  return vld2_lane_p8(a, b, 7);
}

// CHECK-LABEL: define void @test_vld2_lane_p16(%struct.poly16x4x2_t* noalias sret %agg.result, i16* %a, [2 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP13:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
// CHECK:   ret void
poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {
  return vld2_lane_p16(a, b, 3);
}


// CHECK-LABEL: define void @test_vld3q_u8(%struct.uint8x16x3_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
// CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 48, i32 16, i1 false)
// CHECK:   ret void
uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
  return vld3q_u8(a);
}

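// NOTE (annotation): the quad-register vld3q tests have a simpler shape
// than the lane loads: with no struct argument to coerce, the checks
// reduce to a single @llvm.arm.neon.vld3 call whose three q-register
// results are stored to __ret and then memcpy'd (48 bytes at 16-byte
// alignment) into the sret return slot.
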
// CHECK-LABEL: define void @test_vld3q_u16(%struct.uint16x8x3_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
// CHECK:   ret void
uint16x8x3_t test_vld3q_u16(uint16_t const * a) {
  return vld3q_u16(a);
}

// CHECK-LABEL: define void @test_vld3q_u32(%struct.uint32x4x3_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
// CHECK:   ret void
uint32x4x3_t test_vld3q_u32(uint32_t const * a) {
  return vld3q_u32(a);
}

// CHECK-LABEL: define void @test_vld3q_s8(%struct.int8x16x3_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
// CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 48, i32 16, i1 false)
// CHECK:   ret void
int8x16x3_t test_vld3q_s8(int8_t const * a) {
  return vld3q_s8(a);
}

// CHECK-LABEL: define void @test_vld3q_s16(%struct.int16x8x3_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
// CHECK:   ret void
int16x8x3_t test_vld3q_s16(int16_t const * a) {
  return vld3q_s16(a);
}

// CHECK-LABEL: define void @test_vld3q_s32(%struct.int32x4x3_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
// CHECK:   ret void
int32x4x3_t test_vld3q_s32(int32_t const * a) {
  return vld3q_s32(a);
}

// CHECK-LABEL: define void @test_vld3q_f16(%struct.float16x8x3_t* noalias sret %agg.result, half* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
// CHECK:   ret void
float16x8x3_t test_vld3q_f16(float16_t const * a) {
  return vld3q_f16(a);
}

// CHECK-LABEL: define void @test_vld3q_f32(%struct.float32x4x3_t* noalias sret %agg.result, float* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_V]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
// CHECK:   ret void
float32x4x3_t test_vld3q_f32(float32_t const * a) {
  return vld3q_f32(a);
}

// CHECK-LABEL: define void @test_vld3q_p8(%struct.poly8x16x3_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
// CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 48, i32 16, i1 false)
// CHECK:   ret void
poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
  return vld3q_p8(a);
}

// CHECK-LABEL: define void @test_vld3q_p16(%struct.poly16x8x3_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
// CHECK:   ret void
poly16x8x3_t test_vld3q_p16(poly16_t const * a) {
  return vld3q_p16(a);
}

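// NOTE (annotation): polynomial vectors share their IR representation
// with the unsigned integer vectors, so the p8/p16 checks above are
// identical to the u8/u16 ones apart from the struct type names. The
// d-register vld3 tests that follow repeat the same pattern with 24-byte
// copies at 8-byte alignment.
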
// CHECK-LABEL: define void @test_vld3_u8(%struct.uint8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK:   [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 24, i32 8, i1 false)
// CHECK:   ret void
uint8x8x3_t test_vld3_u8(uint8_t const * a) {
  return vld3_u8(a);
}

// CHECK-LABEL: define void @test_vld3_u16(%struct.uint16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
// CHECK:   ret void
uint16x4x3_t test_vld3_u16(uint16_t const * a) {
  return vld3_u16(a);
}

// CHECK-LABEL: define void @test_vld3_u32(%struct.uint32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
// CHECK:   ret void
uint32x2x3_t test_vld3_u32(uint32_t const * a) {
  return vld3_u32(a);
}

// CHECK-LABEL: define void @test_vld3_u64(%struct.uint64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
// CHECK:   ret void
uint64x1x3_t test_vld3_u64(uint64_t const * a) {
  return vld3_u64(a);
}

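// NOTE (annotation): the 64-bit variants load <1 x i64> triples, yet the
// trailing alignment operand stays at 4 rather than 8; this is consistent
// with the 4-byte alignment that the apcs-gnu ABI selected by this test's
// RUN line gives 64-bit integers.
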
// CHECK-LABEL: define void @test_vld3_s8(%struct.int8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK:   [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 24, i32 8, i1 false)
// CHECK:   ret void
int8x8x3_t test_vld3_s8(int8_t const * a) {
  return vld3_s8(a);
}

// CHECK-LABEL: define void @test_vld3_s16(%struct.int16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
// CHECK:   ret void
int16x4x3_t test_vld3_s16(int16_t const * a) {
  return vld3_s16(a);
}

// CHECK-LABEL: define void @test_vld3_s32(%struct.int32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
// CHECK:   ret void
int32x2x3_t test_vld3_s32(int32_t const * a) {
  return vld3_s32(a);
}

// CHECK-LABEL: define void @test_vld3_s64(%struct.int64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
// CHECK:   ret void
int64x1x3_t test_vld3_s64(int64_t const * a) {
  return vld3_s64(a);
}

// CHECK-LABEL: define void @test_vld3_f16(%struct.float16x4x3_t* noalias sret %agg.result, half* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
// CHECK:   ret void
float16x4x3_t test_vld3_f16(float16_t const * a) {
  return vld3_f16(a);
}

// CHECK-LABEL: define void @test_vld3_f32(%struct.float32x2x3_t* noalias sret %agg.result, float* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD3_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_V]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
// CHECK:   ret void
float32x2x3_t test_vld3_f32(float32_t const * a) {
  return vld3_f32(a);
}

// CHECK-LABEL: define void @test_vld3_p8(%struct.poly8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK:   [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 24, i32 8, i1 false)
// CHECK:   ret void
poly8x8x3_t test_vld3_p8(poly8_t const * a) {
  return vld3_p8(a);
}

// CHECK-LABEL: define void @test_vld3_p16(%struct.poly16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
// CHECK:   ret void
poly16x4x3_t test_vld3_p16(poly16_t const * a) {
  return vld3_p16(a);
}


// CHECK-LABEL: define void @test_vld3_dup_u8(%struct.uint8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* %agg.result to i8*
// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
// CHECK:   ret void
uint8x8x3_t test_vld3_dup_u8(uint8_t const * a) {
  return vld3_dup_u8(a);
}

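// NOTE (annotation): vld3_dup has no dedicated intrinsic in this IR; the
// checks instead expect a @llvm.arm.neon.vld3lane call at lane 0 with
// undef vector operands, after which each of the three results is
// splatted across the vector by a shufflevector whose mask is
// zeroinitializer.
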
// CHECK-LABEL: define void @test_vld3_dup_u16(%struct.uint16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint16x4x3_t* %agg.result to i8*
// CHECK:   [[TMP10:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
// CHECK:   ret void
uint16x4x3_t test_vld3_dup_u16(uint16_t const * a) {
  return vld3_dup_u16(a);
}

// CHECK-LABEL: define void @test_vld3_dup_u32(%struct.uint32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP8]]
// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint32x2x3_t* %agg.result to i8*
// CHECK:   [[TMP10:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
// CHECK:   ret void
uint32x2x3_t test_vld3_dup_u32(uint32_t const * a) {
  return vld3_dup_u32(a);
}

// CHECK-LABEL: define void @test_vld3_dup_u64(%struct.uint64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
// CHECK:   ret void
uint64x1x3_t test_vld3_dup_u64(uint64_t const * a) {
  return vld3_dup_u64(a);
}

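// NOTE (annotation): for <1 x i64> a lane load and a full load coincide,
// so the 64-bit dup tests expect a plain @llvm.arm.neon.vld3 call and no
// splat shuffles, matching the non-dup vld3_u64/vld3_s64 checks.
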
// CHECK-LABEL: define void @test_vld3_dup_s8(%struct.int8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* %agg.result to i8*
// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
// CHECK:   ret void
int8x8x3_t test_vld3_dup_s8(int8_t const * a) {
  return vld3_dup_s8(a);
}

// CHECK-LABEL: define void @test_vld3_dup_s16(%struct.int16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
// CHECK:   [[TMP9:%.*]] = bitcast %struct.int16x4x3_t* %agg.result to i8*
// CHECK:   [[TMP10:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
// CHECK:   ret void
int16x4x3_t test_vld3_dup_s16(int16_t const * a) {
  return vld3_dup_s16(a);
}

// CHECK-LABEL: define void @test_vld3_dup_s32(%struct.int32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP8]]
// CHECK:   [[TMP9:%.*]] = bitcast %struct.int32x2x3_t* %agg.result to i8*
// CHECK:   [[TMP10:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
// CHECK:   ret void
int32x2x3_t test_vld3_dup_s32(int32_t const * a) {
  return vld3_dup_s32(a);
}

// CHECK-LABEL: define void @test_vld3_dup_s64(%struct.int64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x3_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
// CHECK:   ret void
int64x1x3_t test_vld3_dup_s64(int64_t const * a) {
  return vld3_dup_s64(a);
}

// CHECK-LABEL: define void @test_vld3_dup_f16(%struct.float16x4x3_t* noalias sret %agg.result, half* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
// CHECK:   [[TMP9:%.*]] = bitcast %struct.float16x4x3_t* %agg.result to i8*
// CHECK:   [[TMP10:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
// CHECK:   ret void
float16x4x3_t test_vld3_dup_f16(float16_t const * a) {
  return vld3_dup_f16(a);
}

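// NOTE (annotation): as with vld2_lane, the f16 dup test piggybacks on
// the v4i16 form of the intrinsic, so its checks match the 16-bit integer
// dup tests except for the struct type in the surrounding copies.
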
// CHECK-LABEL: define void @test_vld3_dup_f32(%struct.float32x2x3_t* noalias sret %agg.result, float* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* [[TMP1]], <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], <2 x float> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP3]], <2 x float> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP5]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <2 x i32> zeroinitializer
// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP5]], <2 x float> [[LANE2]], 2
// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[TMP7]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP8]]
// CHECK:   [[TMP9:%.*]] = bitcast %struct.float32x2x3_t* %agg.result to i8*
// CHECK:   [[TMP10:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
// CHECK:   ret void
float32x2x3_t test_vld3_dup_f32(float32_t const * a) {
  return vld3_dup_f32(a);
}

6488 // CHECK-LABEL: define void @test_vld3_dup_p8(%struct.poly8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
6489 // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
6490 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
6491 // CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
6492 // CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
6493 // CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
6494 // CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
6495 // CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
6496 // CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
6497 // CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
6498 // CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
6499 // CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
6500 // CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
6501 // CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
6502 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
6503 // CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* %agg.result to i8*
6504 // CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
6505 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
6506 // CHECK:   ret void
test_vld3_dup_p8(poly8_t const * a)6507 poly8x8x3_t test_vld3_dup_p8(poly8_t const * a) {
6508   return vld3_dup_p8(a);
6509 }
6510 
6511 // CHECK-LABEL: define void @test_vld3_dup_p16(%struct.poly16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
6512 // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
6513 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
6514 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
6515 // CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
6516 // CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
6517 // CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
6518 // CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
6519 // CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
6520 // CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
6521 // CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
6522 // CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
6523 // CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
6524 // CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
6525 // CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
6526 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
6527 // CHECK:   [[TMP9:%.*]] = bitcast %struct.poly16x4x3_t* %agg.result to i8*
6528 // CHECK:   [[TMP10:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
6529 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
6530 // CHECK:   ret void
test_vld3_dup_p16(poly16_t const * a)6531 poly16x4x3_t test_vld3_dup_p16(poly16_t const * a) {
6532   return vld3_dup_p16(a);
6533 }
6534 
6535 
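// The vld3q_lane tests pass the three-vector aggregate `b` as a [6 x i64]
// coercion, spill it to the [[B]] alloca, copy it into [[__S1]], and reload
// each member vector through a <16 x i8> bitcast before the vld3lane call.
// The last two i32 arguments of the intrinsic are the lane index and the
// element alignment in bytes.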
// CHECK-LABEL: define void @test_vld3q_lane_u16(%struct.uint16x8x3_t* noalias sret %agg.result, i16* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
// CHECK:   ret void
uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) {
  return vld3q_lane_u16(a, b, 7);
}

// CHECK-LABEL: define void @test_vld3q_lane_u32(%struct.uint32x4x3_t* noalias sret %agg.result, i32* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], i32 3, i32 4)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
// CHECK:   ret void
uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) {
  return vld3q_lane_u32(a, b, 3);
}

// CHECK-LABEL: define void @test_vld3q_lane_s16(%struct.int16x8x3_t* noalias sret %agg.result, i16* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
// CHECK:   ret void
int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) {
  return vld3q_lane_s16(a, b, 7);
}

// CHECK-LABEL: define void @test_vld3q_lane_s32(%struct.int32x4x3_t* noalias sret %agg.result, i32* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], i32 3, i32 4)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
// CHECK:   ret void
int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
  return vld3q_lane_s32(a, b, 3);
}

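// float16 q-register lanes have no dedicated IR form here: the <8 x half>
// members are bitcast to <8 x i16> and the test expects the same
// vld3lane.v8i16 call used by the 16-bit integer variants.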
// CHECK-LABEL: define void @test_vld3q_lane_f16(%struct.float16x8x3_t* noalias sret %agg.result, half* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
// CHECK:   ret void
float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
  return vld3q_lane_f16(a, b, 7);
}

// CHECK-LABEL: define void @test_vld3q_lane_f32(%struct.float32x4x3_t* noalias sret %agg.result, float* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32.p0i8(i8* [[TMP4]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], i32 3, i32 4)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x float>, <4 x float>, <4 x float> }*
// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_LANE_V]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
// CHECK:   ret void
float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) {
  return vld3q_lane_f32(a, b, 3);
}

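// poly16 reuses the <8 x i16> lowering, so the checks below are structurally
// identical to the u16/s16 q-register tests apart from the struct type names.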
// CHECK-LABEL: define void @test_vld3q_lane_p16(%struct.poly16x8x3_t* noalias sret %agg.result, i16* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
// CHECK:   ret void
poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) {
  return vld3q_lane_p16(a, b, 7);
}

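// The d-register vld3_lane tests coerce `b` as [3 x i64] with 8-byte
// alignment (24-byte memcpy); the 8-bit variants pass %a straight through
// to the intrinsic since it is already an i8 pointer and needs no bitcast.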
// CHECK-LABEL: define void @test_vld3_lane_u8(%struct.uint8x8x3_t* noalias sret %agg.result, i8* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* %agg.result to i8*
// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
// CHECK:   ret void
uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) {
  return vld3_lane_u8(a, b, 7);
}

// CHECK-LABEL: define void @test_vld3_lane_u16(%struct.uint16x4x3_t* noalias sret %agg.result, i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
// CHECK:   ret void
uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) {
  return vld3_lane_u16(a, b, 3);
}

// CHECK-LABEL: define void @test_vld3_lane_u32(%struct.uint32x2x3_t* noalias sret %agg.result, i32* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK:   [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], i32 1, i32 4)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
// CHECK:   ret void
uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) {
  return vld3_lane_u32(a, b, 1);
}

// CHECK-LABEL: define void @test_vld3_lane_s8(%struct.int8x8x3_t* noalias sret %agg.result, i8* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* %agg.result to i8*
// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
// CHECK:   ret void
int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) {
  return vld3_lane_s8(a, b, 7);
}

// CHECK-LABEL: define void @test_vld3_lane_s16(%struct.int16x4x3_t* noalias sret %agg.result, i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
// CHECK:   ret void
int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) {
  return vld3_lane_s16(a, b, 3);
}

// CHECK-LABEL: define void @test_vld3_lane_s32(%struct.int32x2x3_t* noalias sret %agg.result, i32* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
// CHECK:   [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], i32 1, i32 4)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
// CHECK:   ret void
int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) {
  return vld3_lane_s32(a, b, 1);
}

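// As with the q-register form, half-precision d-register lanes are handled
// as <4 x i16>: the checks expect vld3lane.v4i16 with lane 3 and 2-byte
// element alignment.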
// CHECK-LABEL: define void @test_vld3_lane_f16(%struct.float16x4x3_t* noalias sret %agg.result, half* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
// CHECK:   ret void
float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) {
  return vld3_lane_f16(a, b, 3);
}

// CHECK-LABEL: define void @test_vld3_lane_f32(%struct.float32x2x3_t* noalias sret %agg.result, float* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
// CHECK:   [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* [[TMP4]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], i32 1, i32 4)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <2 x float>, <2 x float>, <2 x float> }*
// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE_V]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
// CHECK:   ret void
float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) {
  return vld3_lane_f32(a, b, 1);
}

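// The polynomial d-register variants close out the vld3_lane group: p8
// matches the vld3lane.v8i8 pattern of the other 8-bit tests and p16 the
// vld3lane.v4i16 pattern of the other 16-bit tests.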
// CHECK-LABEL: define void @test_vld3_lane_p8(%struct.poly8x8x3_t* noalias sret %agg.result, i8* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* %agg.result to i8*
// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
// CHECK:   ret void
poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) {
  return vld3_lane_p8(a, b, 7);
}

// CHECK-LABEL: define void @test_vld3_lane_p16(%struct.poly16x4x3_t* noalias sret %agg.result, i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2)
// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]]
// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* %agg.result to i8*
// CHECK:   [[TMP16:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
// CHECK:   ret void
poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) {
7158   return vld3_lane_p16(a, b, 3);
7159 }
7160 
7161 
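// vld4q intrinsics: 4-element interleaved loads into quad (128-bit)
// registers. Each test checks that the builtin lowers to a single
// @llvm.arm.neon.vld4 call on an i8* with the element type's natural
// alignment, and that the 64-byte result struct is returned through the
// sret slot via llvm.memcpy.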
// CHECK-LABEL: define void @test_vld4q_u8(%struct.uint8x16x4_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
// CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 64, i32 16, i1 false)
// CHECK:   ret void
uint8x16x4_t test_vld4q_u8(uint8_t const * a) {
  return vld4q_u8(a);
}

// CHECK-LABEL: define void @test_vld4q_u16(%struct.uint16x8x4_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
// CHECK:   ret void
uint16x8x4_t test_vld4q_u16(uint16_t const * a) {
  return vld4q_u16(a);
}

// CHECK-LABEL: define void @test_vld4q_u32(%struct.uint32x4x4_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
// CHECK:   ret void
uint32x4x4_t test_vld4q_u32(uint32_t const * a) {
  return vld4q_u32(a);
}

// CHECK-LABEL: define void @test_vld4q_s8(%struct.int8x16x4_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
// CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 64, i32 16, i1 false)
// CHECK:   ret void
int8x16x4_t test_vld4q_s8(int8_t const * a) {
  return vld4q_s8(a);
}

// CHECK-LABEL: define void @test_vld4q_s16(%struct.int16x8x4_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
// CHECK:   ret void
int16x8x4_t test_vld4q_s16(int16_t const * a) {
  return vld4q_s16(a);
}

// CHECK-LABEL: define void @test_vld4q_s32(%struct.int32x4x4_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
// CHECK:   ret void
int32x4x4_t test_vld4q_s32(int32_t const * a) {
  return vld4q_s32(a);
}

// CHECK-LABEL: define void @test_vld4q_f16(%struct.float16x8x4_t* noalias sret %agg.result, half* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
// CHECK:   ret void
float16x8x4_t test_vld4q_f16(float16_t const * a) {
  return vld4q_f16(a);
}

// CHECK-LABEL: define void @test_vld4q_f32(%struct.float32x4x4_t* noalias sret %agg.result, float* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
// CHECK:   ret void
float32x4x4_t test_vld4q_f32(float32_t const * a) {
  return vld4q_f32(a);
}

// CHECK-LABEL: define void @test_vld4q_p8(%struct.poly8x16x4_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
// CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 64, i32 16, i1 false)
// CHECK:   ret void
poly8x16x4_t test_vld4q_p8(poly8_t const * a) {
  return vld4q_p8(a);
}

// CHECK-LABEL: define void @test_vld4q_p16(%struct.poly16x8x4_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
// CHECK:   ret void
poly16x8x4_t test_vld4q_p16(poly16_t const * a) {
  return vld4q_p16(a);
}

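// vld4 intrinsics: the 64-bit (d-register) counterparts. The returned
// struct is 32 bytes, so the trailing memcpy copies i32 32 with 8-byte
// alignment; note that the v1i64 variants still pass i32 4 as the
// alignment argument.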
// CHECK-LABEL: define void @test_vld4_u8(%struct.uint8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
// CHECK:   [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 8, i1 false)
// CHECK:   ret void
uint8x8x4_t test_vld4_u8(uint8_t const * a) {
  return vld4_u8(a);
}

// CHECK-LABEL: define void @test_vld4_u16(%struct.uint16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
// CHECK:   ret void
uint16x4x4_t test_vld4_u16(uint16_t const * a) {
  return vld4_u16(a);
}

// CHECK-LABEL: define void @test_vld4_u32(%struct.uint32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
// CHECK:   ret void
uint32x2x4_t test_vld4_u32(uint32_t const * a) {
  return vld4_u32(a);
}

// CHECK-LABEL: define void @test_vld4_u64(%struct.uint64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
// CHECK:   ret void
uint64x1x4_t test_vld4_u64(uint64_t const * a) {
  return vld4_u64(a);
}

// CHECK-LABEL: define void @test_vld4_s8(%struct.int8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
// CHECK:   [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 8, i1 false)
// CHECK:   ret void
int8x8x4_t test_vld4_s8(int8_t const * a) {
  return vld4_s8(a);
}

// CHECK-LABEL: define void @test_vld4_s16(%struct.int16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
// CHECK:   ret void
int16x4x4_t test_vld4_s16(int16_t const * a) {
  return vld4_s16(a);
}

// CHECK-LABEL: define void @test_vld4_s32(%struct.int32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
// CHECK:   ret void
int32x2x4_t test_vld4_s32(int32_t const * a) {
  return vld4_s32(a);
}

// CHECK-LABEL: define void @test_vld4_s64(%struct.int64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
// CHECK:   ret void
int64x1x4_t test_vld4_s64(int64_t const * a) {
  return vld4_s64(a);
}

// CHECK-LABEL: define void @test_vld4_f16(%struct.float16x4x4_t* noalias sret %agg.result, half* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
// CHECK:   ret void
float16x4x4_t test_vld4_f16(float16_t const * a) {
  return vld4_f16(a);
}

// CHECK-LABEL: define void @test_vld4_f32(%struct.float32x2x4_t* noalias sret %agg.result, float* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD4_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
// CHECK:   ret void
float32x2x4_t test_vld4_f32(float32_t const * a) {
  return vld4_f32(a);
}

// CHECK-LABEL: define void @test_vld4_p8(%struct.poly8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
// CHECK:   [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* %agg.result to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 8, i1 false)
// CHECK:   ret void
poly8x8x4_t test_vld4_p8(poly8_t const * a) {
  return vld4_p8(a);
}

// CHECK-LABEL: define void @test_vld4_p16(%struct.poly16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
// CHECK:   ret void
poly16x4x4_t test_vld4_p16(poly16_t const * a) {
  return vld4_p16(a);
}

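// vld4_dup intrinsics: load one 4-element group and splat it across all
// lanes. Codegen emits @llvm.arm.neon.vld4lane at lane 0 on undef vectors,
// then splats each member with a zeroinitializer shufflevector mask; the
// 64-bit variants degenerate to a plain @llvm.arm.neon.vld4 of <1 x i64>.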
// CHECK-LABEL: define void @test_vld4_dup_u8(%struct.uint8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
// CHECK:   [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], 3
// CHECK:   [[LANE3:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
// CHECK:   [[TMP8:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], <8 x i8> [[LANE3]], 3
// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP8]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP9]]
// CHECK:   [[TMP10:%.*]] = bitcast %struct.uint8x8x4_t* %agg.result to i8*
// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP10]], i8* [[TMP11]], i32 32, i32 8, i1 false)
// CHECK:   ret void
uint8x8x4_t test_vld4_dup_u8(uint8_t const * a) {
  return vld4_dup_u8(a);
}

// CHECK-LABEL: define void @test_vld4_dup_u16(%struct.uint16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
// CHECK:   [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
// CHECK:   [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
// CHECK:   [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint16x4x4_t* %agg.result to i8*
// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
// CHECK:   ret void
uint16x4x4_t test_vld4_dup_u16(uint16_t const * a) {
  return vld4_dup_u16(a);
}

// CHECK-LABEL: define void @test_vld4_dup_u32(%struct.uint32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
// CHECK:   [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], 3
// CHECK:   [[LANE3:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP8]], <2 x i32> zeroinitializer
// CHECK:   [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], <2 x i32> [[LANE3]], 3
// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP10]]
// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint32x2x4_t* %agg.result to i8*
// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
// CHECK:   ret void
uint32x2x4_t test_vld4_dup_u32(uint32_t const * a) {
  return vld4_dup_u32(a);
}

// CHECK-LABEL: define void @test_vld4_dup_u64(%struct.uint64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
// CHECK:   ret void
uint64x1x4_t test_vld4_dup_u64(uint64_t const * a) {
  return vld4_dup_u64(a);
}

// CHECK-LABEL: define void @test_vld4_dup_s8(%struct.int8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
// CHECK:   [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], 3
// CHECK:   [[LANE3:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
// CHECK:   [[TMP8:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], <8 x i8> [[LANE3]], 3
// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP8]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP9]]
// CHECK:   [[TMP10:%.*]] = bitcast %struct.int8x8x4_t* %agg.result to i8*
// CHECK:   [[TMP11:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP10]], i8* [[TMP11]], i32 32, i32 8, i1 false)
// CHECK:   ret void
int8x8x4_t test_vld4_dup_s8(int8_t const * a) {
  return vld4_dup_s8(a);
}

// CHECK-LABEL: define void @test_vld4_dup_s16(%struct.int16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
// CHECK:   [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
// CHECK:   [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
// CHECK:   [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
// CHECK:   [[TMP11:%.*]] = bitcast %struct.int16x4x4_t* %agg.result to i8*
// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
// CHECK:   ret void
int16x4x4_t test_vld4_dup_s16(int16_t const * a) {
  return vld4_dup_s16(a);
}

// CHECK-LABEL: define void @test_vld4_dup_s32(%struct.int32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
// CHECK:   [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], 3
// CHECK:   [[LANE3:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP8]], <2 x i32> zeroinitializer
// CHECK:   [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], <2 x i32> [[LANE3]], 3
// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP10]]
// CHECK:   [[TMP11:%.*]] = bitcast %struct.int32x2x4_t* %agg.result to i8*
// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
// CHECK:   ret void
int32x2x4_t test_vld4_dup_s32(int32_t const * a) {
  return vld4_dup_s32(a);
}

// CHECK-LABEL: define void @test_vld4_dup_s64(%struct.int64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x4_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
// CHECK:   ret void
int64x1x4_t test_vld4_dup_s64(int64_t const * a) {
  return vld4_dup_s64(a);
}

// CHECK-LABEL: define void @test_vld4_dup_f16(%struct.float16x4x4_t* noalias sret %agg.result, half* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
// CHECK:   [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
// CHECK:   [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
// CHECK:   [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
// CHECK:   [[TMP11:%.*]] = bitcast %struct.float16x4x4_t* %agg.result to i8*
// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
// CHECK:   ret void
float16x4x4_t test_vld4_dup_f16(float16_t const * a) {
  return vld4_dup_f16(a);
}

// CHECK-LABEL: define void @test_vld4_dup_f32(%struct.float32x2x4_t* noalias sret %agg.result, float* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* [[TMP1]], <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], <2 x float> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP3]], <2 x float> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP5]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <2 x i32> zeroinitializer
// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP5]], <2 x float> [[LANE2]], 2
// CHECK:   [[TMP8:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP7]], 3
// CHECK:   [[LANE3:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP8]], <2 x i32> zeroinitializer
// CHECK:   [[TMP9:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP7]], <2 x float> [[LANE3]], 3
// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP9]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP10]]
// CHECK:   [[TMP11:%.*]] = bitcast %struct.float32x2x4_t* %agg.result to i8*
// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
// CHECK:   ret void
float32x2x4_t test_vld4_dup_f32(float32_t const * a) {
  return vld4_dup_f32(a);
}

// CHECK-LABEL: define void @test_vld4_dup_p8(%struct.poly8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
// CHECK:   [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], 3
// CHECK:   [[LANE3:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
// CHECK:   [[TMP8:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], <8 x i8> [[LANE3]], 3
// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP8]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP9]]
// CHECK:   [[TMP10:%.*]] = bitcast %struct.poly8x8x4_t* %agg.result to i8*
// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP10]], i8* [[TMP11]], i32 32, i32 8, i1 false)
// CHECK:   ret void
poly8x8x4_t test_vld4_dup_p8(poly8_t const * a) {
  return vld4_dup_p8(a);
}

// CHECK-LABEL: define void @test_vld4_dup_p16(%struct.poly16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
// CHECK:   [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
// CHECK:   [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
// CHECK:   [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly16x4x4_t* %agg.result to i8*
// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
// CHECK:   ret void
poly16x4x4_t test_vld4_dup_p16(poly16_t const * a) {
  return vld4_dup_p16(a);
}

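// vld4q_lane intrinsics: the incoming 4-vector aggregate arrives as
// [8 x i64] %b.coerce, is spilled to [[B]] and copied into [[__S1]], and
// each member is reloaded and bitcast before the @llvm.arm.neon.vld4lane
// call with the requested lane index.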
7785 // CHECK-LABEL: define void @test_vld4q_lane_u16(%struct.uint16x8x4_t* noalias sret %agg.result, i16* %a, [8 x i64] %b.coerce) #0 {
7786 // CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
7787 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
7788 // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
7789 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
7790 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
7791 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
7792 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
7793 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
7794 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
7795 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
7796 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
7797 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
7798 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
7799 // CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
7800 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
7801 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
7802 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
7803 // CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
7804 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
7805 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
7806 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
7807 // CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
7808 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
7809 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
7810 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
7811 // CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
7812 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
7813 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
7814 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
7815 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
7816 // CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
7817 // CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
7818 // CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
7819 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
7820 // CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* %agg.result to i8*
7821 // CHECK:   [[TMP19:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
7822 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
7823 // CHECK:   ret void
7824 uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
7825   return vld4q_lane_u16(a, b, 7);
7826 }
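
// A minimal round-trip sketch (helper name assumed, no CHECK lines attached):
// vld4q_lane_u16 replaces lane 7 of each of the four q registers with the four
// u16 values at src, and vst4q_lane_u16 writes those same four values back out.
// The lane index must be a compile-time constant in [0,7] for 8x16-bit vectors.
static inline void sketch_copy_four_u16(uint16_t * dst, uint16_t const * src,
                                        uint16x8x4_t scratch) {
  scratch = vld4q_lane_u16(src, scratch, 7); // reads src[0..3], one per register
  vst4q_lane_u16(dst, scratch, 7);           // writes dst[0..3] from lane 7
}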
7827 
7828 // CHECK-LABEL: define void @test_vld4q_lane_u32(%struct.uint32x4x4_t* noalias sret %agg.result, i32* %a, [8 x i64] %b.coerce) #0 {
7829 // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
7830 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
7831 // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
7832 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
7833 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
7834 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
7835 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
7836 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
7837 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
7838 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
7839 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
7840 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
7841 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
7842 // CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
7843 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
7844 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
7845 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
7846 // CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
7847 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
7848 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
7849 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
7850 // CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
7851 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
7852 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
7853 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
7854 // CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
7855 // CHECK:   [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
7856 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
7857 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
7858 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
7859 // CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
7860 // CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], i32 3, i32 4)
7861 // CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
7862 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP17]]
7863 // CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* %agg.result to i8*
7864 // CHECK:   [[TMP19:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
7865 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
7866 // CHECK:   ret void
7867 uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
7868   return vld4q_lane_u32(a, b, 3);
7869 }
7870 
7871 // CHECK-LABEL: define void @test_vld4q_lane_s16(%struct.int16x8x4_t* noalias sret %agg.result, i16* %a, [8 x i64] %b.coerce) #0 {
7872 // CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
7873 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
7874 // CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
7875 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
7876 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
7877 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
7878 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
7879 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
7880 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
7881 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
7882 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
7883 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
7884 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
7885 // CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
7886 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
7887 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
7888 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
7889 // CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
7890 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
7891 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
7892 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
7893 // CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
7894 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
7895 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
7896 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
7897 // CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
7898 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
7899 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
7900 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
7901 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
7902 // CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
7903 // CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
7904 // CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
7905 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
7906 // CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* %agg.result to i8*
7907 // CHECK:   [[TMP19:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
7908 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
7909 // CHECK:   ret void
7910 int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
7911   return vld4q_lane_s16(a, b, 7);
7912 }
7913 
7914 // CHECK-LABEL: define void @test_vld4q_lane_s32(%struct.int32x4x4_t* noalias sret %agg.result, i32* %a, [8 x i64] %b.coerce) #0 {
7915 // CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
7916 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
7917 // CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
7918 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
7919 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
7920 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
7921 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
7922 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
7923 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
7924 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
7925 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
7926 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
7927 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
7928 // CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
7929 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
7930 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
7931 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
7932 // CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
7933 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
7934 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
7935 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
7936 // CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
7937 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
7938 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
7939 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
7940 // CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
7941 // CHECK:   [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
7942 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
7943 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
7944 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
7945 // CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
7946 // CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], i32 3, i32 4)
7947 // CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
7948 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP17]]
7949 // CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* %agg.result to i8*
7950 // CHECK:   [[TMP19:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
7951 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
7952 // CHECK:   ret void
7953 int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
7954   return vld4q_lane_s32(a, b, 3);
7955 }
7956 
7957 // CHECK-LABEL: define void @test_vld4q_lane_f16(%struct.float16x8x4_t* noalias sret %agg.result, half* %a, [8 x i64] %b.coerce) #0 {
7958 // CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
7959 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
7960 // CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
7961 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
7962 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
7963 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
7964 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
7965 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
7966 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
7967 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
7968 // CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
7969 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
7970 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
7971 // CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
7972 // CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
7973 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
7974 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
7975 // CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
7976 // CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
7977 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
7978 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
7979 // CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
7980 // CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
7981 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
7982 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
7983 // CHECK:   [[TMP11:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
7984 // CHECK:   [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <16 x i8>
7985 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
7986 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
7987 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
7988 // CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
7989 // CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
7990 // CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
7991 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
7992 // CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* %agg.result to i8*
7993 // CHECK:   [[TMP19:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
7994 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
7995 // CHECK:   ret void
7996 float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
7997   return vld4q_lane_f16(a, b, 7);
7998 }
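
// Note on the f16 lowering above: this Clang has no half-vector form of the
// lane-load intrinsic, so the <8 x half> operands are bitcast to <8 x i16> and
// the same @llvm.arm.neon.vld4lane.v8i16 call is reused; the lane bound (7)
// and the 2-byte alignment operand therefore match the u16/s16/p16 cases.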
7999 
8000 // CHECK-LABEL: define void @test_vld4q_lane_f32(%struct.float32x4x4_t* noalias sret %agg.result, float* %a, [8 x i64] %b.coerce) #0 {
8001 // CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
8002 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
8003 // CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
8004 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
8005 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
8006 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
8007 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
8008 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
8009 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
8010 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
8011 // CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
8012 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
8013 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
8014 // CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
8015 // CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
8016 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
8017 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
8018 // CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
8019 // CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
8020 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
8021 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
8022 // CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
8023 // CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
8024 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
8025 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
8026 // CHECK:   [[TMP11:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
8027 // CHECK:   [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <16 x i8>
8028 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
8029 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
8030 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
8031 // CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x float>
8032 // CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32.p0i8(i8* [[TMP4]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], <4 x float> [[TMP16]], i32 3, i32 4)
8033 // CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
8034 // CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP17]]
8035 // CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* %agg.result to i8*
8036 // CHECK:   [[TMP19:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
8037 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
8038 // CHECK:   ret void
8039 float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
8040   return vld4q_lane_f32(a, b, 3);
8041 }
8042 
8043 // CHECK-LABEL: define void @test_vld4q_lane_p16(%struct.poly16x8x4_t* noalias sret %agg.result, i16* %a, [8 x i64] %b.coerce) #0 {
8044 // CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
8045 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
8046 // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
8047 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
8048 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
8049 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
8050 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
8051 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
8052 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
8053 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
8054 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
8055 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
8056 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
8057 // CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
8058 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
8059 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
8060 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
8061 // CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
8062 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
8063 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
8064 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
8065 // CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
8066 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
8067 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
8068 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
8069 // CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
8070 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
8071 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
8072 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
8073 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
8074 // CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
8075 // CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
8076 // CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
8077 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
8078 // CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* %agg.result to i8*
8079 // CHECK:   [[TMP19:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
8080 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
8081 // CHECK:   ret void
8082 poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) {
8083   return vld4q_lane_p16(a, b, 7);
8084 }
8085 
8086 // CHECK-LABEL: define void @test_vld4_lane_u8(%struct.uint8x8x4_t* noalias sret %agg.result, i8* %a, [4 x i64] %b.coerce) #0 {
8087 // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
8088 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
8089 // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
8090 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
8091 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
8092 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8093 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
8094 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
8095 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8096 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
8097 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
8098 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
8099 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
8100 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
8101 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
8102 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
8103 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
8104 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
8105 // CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
8106 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
8107 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
8108 // CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
8109 // CHECK:   [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], i32 7, i32 1)
8110 // CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
8111 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP8]]
8112 // CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* %agg.result to i8*
8113 // CHECK:   [[TMP10:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
8114 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 32, i32 8, i1 false)
8115 // CHECK:   ret void
8116 uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) {
8117   return vld4_lane_u8(a, b, 7);
8118 }
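
// For 8-bit elements the lowering needs no bitcasts: %a is already an i8* and
// the sub-vectors are already <8 x i8>, so the builtin is called directly with
// lane 7 and alignment 1. A hedged sketch (helper name assumed):
static inline uint8x8x4_t sketch_patch_last_lane_u8(uint8_t const * a,
                                                    uint8x8x4_t b) {
  return vld4_lane_u8(a, b, 7); // overwrites lane 7 of b.val[0..3] with a[0..3]
}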
8119 
8120 // CHECK-LABEL: define void @test_vld4_lane_u16(%struct.uint16x4x4_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
8121 // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
8122 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
8123 // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
8124 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
8125 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
8126 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8127 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
8128 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
8129 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8130 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
8131 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
8132 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
8133 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
8134 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
8135 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
8136 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
8137 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
8138 // CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
8139 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
8140 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
8141 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
8142 // CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
8143 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
8144 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
8145 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
8146 // CHECK:   [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
8147 // CHECK:   [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
8148 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
8149 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
8150 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
8151 // CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
8152 // CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
8153 // CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
8154 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
8155 // CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* %agg.result to i8*
8156 // CHECK:   [[TMP19:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
8157 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
8158 // CHECK:   ret void
8159 uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) {
8160   return vld4_lane_u16(a, b, 3);
8161 }
8162 
8163 // CHECK-LABEL: define void @test_vld4_lane_u32(%struct.uint32x2x4_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
8164 // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
8165 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
8166 // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
8167 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
8168 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
8169 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8170 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
8171 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
8172 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8173 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
8174 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
8175 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
8176 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
8177 // CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
8178 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
8179 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
8180 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
8181 // CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
8182 // CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
8183 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
8184 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
8185 // CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
8186 // CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
8187 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
8188 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
8189 // CHECK:   [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
8190 // CHECK:   [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
8191 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
8192 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
8193 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
8194 // CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
8195 // CHECK:   [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> [[TMP16]], i32 1, i32 4)
8196 // CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
8197 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP17]]
8198 // CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* %agg.result to i8*
8199 // CHECK:   [[TMP19:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
8200 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
8201 // CHECK:   ret void
8202 uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) {
8203   return vld4_lane_u32(a, b, 1);
8204 }
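
// The legal lane index scales with the vector length: 1 is the last lane of a
// <2 x i32> vector, just as 3 is for <4 x i16> and 7 is for <8 x i8>; the
// trailing i32 4 in the vld4lane.v2i32 call is the element alignment in bytes.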
8205 
8206 // CHECK-LABEL: define void @test_vld4_lane_s8(%struct.int8x8x4_t* noalias sret %agg.result, i8* %a, [4 x i64] %b.coerce) #0 {
8207 // CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
8208 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
8209 // CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
8210 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
8211 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
8212 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8213 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
8214 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
8215 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8216 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
8217 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
8218 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
8219 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
8220 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
8221 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
8222 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
8223 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
8224 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
8225 // CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
8226 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
8227 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
8228 // CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
8229 // CHECK:   [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], i32 7, i32 1)
8230 // CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
8231 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP8]]
8232 // CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* %agg.result to i8*
8233 // CHECK:   [[TMP10:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
8234 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 32, i32 8, i1 false)
8235 // CHECK:   ret void
8236 int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) {
8237   return vld4_lane_s8(a, b, 7);
8238 }
8239 
8240 // CHECK-LABEL: define void @test_vld4_lane_s16(%struct.int16x4x4_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
8241 // CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
8242 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
8243 // CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
8244 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
8245 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
8246 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8247 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
8248 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
8249 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8250 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
8251 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
8252 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
8253 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
8254 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
8255 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
8256 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
8257 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
8258 // CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
8259 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
8260 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
8261 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
8262 // CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
8263 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
8264 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
8265 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
8266 // CHECK:   [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
8267 // CHECK:   [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
8268 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
8269 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
8270 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
8271 // CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
8272 // CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
8273 // CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
8274 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
8275 // CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* %agg.result to i8*
8276 // CHECK:   [[TMP19:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
8277 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
8278 // CHECK:   ret void
8279 int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) {
8280   return vld4_lane_s16(a, b, 3);
8281 }
8282 
8283 // CHECK-LABEL: define void @test_vld4_lane_s32(%struct.int32x2x4_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
8284 // CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
8285 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
8286 // CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
8287 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
8288 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
8289 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8290 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
8291 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
8292 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8293 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
8294 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
8295 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
8296 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
8297 // CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
8298 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
8299 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
8300 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
8301 // CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
8302 // CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
8303 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
8304 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
8305 // CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
8306 // CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
8307 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
8308 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
8309 // CHECK:   [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
8310 // CHECK:   [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
8311 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
8312 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
8313 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
8314 // CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
8315 // CHECK:   [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> [[TMP16]], i32 1, i32 4)
8316 // CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
8317 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP17]]
8318 // CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* %agg.result to i8*
8319 // CHECK:   [[TMP19:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
8320 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
8321 // CHECK:   ret void
8322 int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
8323   return vld4_lane_s32(a, b, 1);
8324 }
8325 
8326 // CHECK-LABEL: define void @test_vld4_lane_f16(%struct.float16x4x4_t* noalias sret %agg.result, half* %a, [4 x i64] %b.coerce) #0 {
8327 // CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
8328 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
8329 // CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
8330 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
8331 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
8332 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8333 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
8334 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
8335 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8336 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
8337 // CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
8338 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
8339 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
8340 // CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
8341 // CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
8342 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
8343 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
8344 // CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
8345 // CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
8346 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
8347 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
8348 // CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
8349 // CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
8350 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
8351 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
8352 // CHECK:   [[TMP11:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
8353 // CHECK:   [[TMP12:%.*]] = bitcast <4 x half> [[TMP11]] to <8 x i8>
8354 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
8355 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
8356 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
8357 // CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
8358 // CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
8359 // CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
8360 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
8361 // CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* %agg.result to i8*
8362 // CHECK:   [[TMP19:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
8363 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
8364 // CHECK:   ret void
8365 float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
8366   return vld4_lane_f16(a, b, 3);
8367 }
8368 
8369 // CHECK-LABEL: define void @test_vld4_lane_f32(%struct.float32x2x4_t* noalias sret %agg.result, float* %a, [4 x i64] %b.coerce) #0 {
8370 // CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
8371 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
8372 // CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
8373 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
8374 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
8375 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8376 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
8377 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
8378 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8379 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
8380 // CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
8381 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
8382 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
8383 // CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
8384 // CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
8385 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
8386 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
8387 // CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
8388 // CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
8389 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
8390 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
8391 // CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
8392 // CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
8393 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
8394 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
8395 // CHECK:   [[TMP11:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
8396 // CHECK:   [[TMP12:%.*]] = bitcast <2 x float> [[TMP11]] to <8 x i8>
8397 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
8398 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
8399 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
8400 // CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x float>
8401 // CHECK:   [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* [[TMP4]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], <2 x float> [[TMP16]], i32 1, i32 4)
8402 // CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
8403 // CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP17]]
8404 // CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* %agg.result to i8*
8405 // CHECK:   [[TMP19:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
8406 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
8407 // CHECK:   ret void
8408 float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) {
8409   return vld4_lane_f32(a, b, 1);
8410 }
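
// A hedged sketch showing the loaded aggregate being consumed; the helper name
// is assumed and the val[] fields follow the arm_neon.h NEON struct convention.
static inline float32x2_t sketch_sum_lane_f32(float32_t const * a,
                                              float32x2x4_t b) {
  float32x2x4_t r = vld4_lane_f32(a, b, 1); // a[0..3] land in lane 1 of r.val[0..3]
  return vadd_f32(vadd_f32(r.val[0], r.val[1]),
                  vadd_f32(r.val[2], r.val[3])); // per-lane sum across the four
}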
8411 
8412 // CHECK-LABEL: define void @test_vld4_lane_p8(%struct.poly8x8x4_t* noalias sret %agg.result, i8* %a, [4 x i64] %b.coerce) #0 {
8413 // CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
8414 // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
8415 // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
8416 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
8417 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
8418 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8419 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
8420 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
8421 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8422 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
8423 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
8424 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
8425 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
8426 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
8427 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
8428 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
8429 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
8430 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
8431 // CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
8432 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
8433 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
8434 // CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
8435 // CHECK:   [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], i32 7, i32 1)
8436 // CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
8437 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP8]]
8438 // CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* %agg.result to i8*
8439 // CHECK:   [[TMP10:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
8440 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 32, i32 8, i1 false)
8441 // CHECK:   ret void
test_vld4_lane_p8(poly8_t const * a,poly8x8x4_t b)8442 poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) {
8443   return vld4_lane_p8(a, b, 7);
8444 }
8445 
8446 // CHECK-LABEL: define void @test_vld4_lane_p16(%struct.poly16x4x4_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
8447 // CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
8448 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
8449 // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
8450 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
8451 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
8452 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8453 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
8454 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
8455 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8456 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
8457 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
8458 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
8459 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
8460 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
8461 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
8462 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
8463 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
8464 // CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
8465 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
8466 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
8467 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
8468 // CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
8469 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
8470 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
8471 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
8472 // CHECK:   [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
8473 // CHECK:   [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
8474 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
8475 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
8476 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
8477 // CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
8478 // CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
8479 // CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
8480 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
8481 // CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* %agg.result to i8*
8482 // CHECK:   [[TMP19:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
8483 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
8484 // CHECK:   ret void
test_vld4_lane_p16(poly16_t const * a,poly16x4x4_t b)8485 poly16x4x4_t test_vld4_lane_p16(poly16_t const * a, poly16x4x4_t b) {
8486   return vld4_lane_p16(a, b, 3);
8487 }
8488 
8489 
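// vmax: element-wise maximum. The s/u/f variants lower to the
// llvm.arm.neon.vmaxs/vmaxu intrinsics; element types wider than i8 are
// bitcast through <8 x i8> (or <16 x i8> for the q forms) on the way in
// and out, as the CHECK lines below show.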
// CHECK-LABEL: define <8 x i8> @test_vmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VMAX_V_I]]
int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
  return vmax_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> [[VMAX_V_I]], <4 x i16> [[VMAX_V1_I]]) #4
// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
  return vmax_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> [[VMAX_V_I]], <2 x i32> [[VMAX_V1_I]]) #4
// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
  return vmax_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VMAX_V_I]]
uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
  return vmax_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> [[VMAX_V_I]], <4 x i16> [[VMAX_V1_I]]) #4
// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
  return vmax_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmax_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> [[VMAX_V_I]], <2 x i32> [[VMAX_V1_I]]) #4
// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
  return vmax_u32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vmax_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> [[VMAX_V_I]], <2 x float> [[VMAX_V1_I]]) #4
// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <2 x float> [[VMAX_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
  return vmax_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VMAXQ_V_I]]
int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
  return vmaxq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> [[VMAXQ_V_I]], <8 x i16> [[VMAXQ_V1_I]]) #4
// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
  return vmaxq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> [[VMAXQ_V_I]], <4 x i32> [[VMAXQ_V1_I]]) #4
// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
  return vmaxq_s32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VMAXQ_V_I]]
uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
  return vmaxq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> [[VMAXQ_V_I]], <8 x i16> [[VMAXQ_V1_I]]) #4
// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
  return vmaxq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> [[VMAXQ_V_I]], <4 x i32> [[VMAXQ_V1_I]]) #4
// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
  return vmaxq_u32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vmaxq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VMAXQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> [[VMAXQ_V_I]], <4 x float> [[VMAXQ_V1_I]]) #4
// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x float>
// CHECK:   ret <4 x float> [[TMP2]]
float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
  return vmaxq_f32(a, b);
}

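// vmin: element-wise minimum, mirroring the vmax tests above but lowering
// to llvm.arm.neon.vmins/vminu.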
// CHECK-LABEL: define <8 x i8> @test_vmin_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VMIN_V_I]]
int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
  return vmin_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vmin_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> [[VMIN_V_I]], <4 x i16> [[VMIN_V1_I]]) #4
// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
  return vmin_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmin_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> [[VMIN_V_I]], <2 x i32> [[VMIN_V1_I]]) #4
// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
  return vmin_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vmin_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VMIN_V_I]]
uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
  return vmin_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vmin_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> [[VMIN_V_I]], <4 x i16> [[VMIN_V1_I]]) #4
// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
  return vmin_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmin_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> [[VMIN_V_I]], <2 x i32> [[VMIN_V1_I]]) #4
// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
  return vmin_u32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vmin_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> [[VMIN_V_I]], <2 x float> [[VMIN_V1_I]]) #4
// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <2 x float> [[VMIN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) {
  return vmin_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vminq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VMINQ_V_I]]
int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) {
  return vminq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vminq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> [[VMINQ_V_I]], <8 x i16> [[VMINQ_V1_I]]) #4
// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
  return vminq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vminq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> [[VMINQ_V_I]], <4 x i32> [[VMINQ_V1_I]]) #4
// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) {
  return vminq_s32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vminq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VMINQ_V_I]]
uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) {
  return vminq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vminq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> [[VMINQ_V_I]], <8 x i16> [[VMINQ_V1_I]]) #4
// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
  return vminq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vminq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> [[VMINQ_V_I]], <4 x i32> [[VMINQ_V1_I]]) #4
// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
  return vminq_u32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vminq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VMINQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> [[VMINQ_V_I]], <4 x float> [[VMINQ_V1_I]]) #4
// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <4 x float> [[VMINQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x float>
// CHECK:   ret <4 x float> [[TMP2]]
float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) {
  return vminq_f32(a, b);
}

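// vmla: multiply-accumulate, a + (b * c). No target intrinsic is involved;
// the lowering is plain IR mul/add (fmul/fadd for the f32 variants).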
// CHECK-LABEL: define <8 x i8> @test_vmla_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
// CHECK:   ret <8 x i8> [[ADD_I]]
int8x8_t test_vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
  return vmla_s8(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vmla_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
// CHECK:   ret <4 x i16> [[ADD_I]]
int16x4_t test_vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vmla_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vmla_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
// CHECK:   ret <2 x i32> [[ADD_I]]
int32x2_t test_vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vmla_s32(a, b, c);
}

// CHECK-LABEL: define <2 x float> @test_vmla_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, %c
// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
// CHECK:   ret <2 x float> [[ADD_I]]
float32x2_t test_vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vmla_f32(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vmla_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
// CHECK:   ret <8 x i8> [[ADD_I]]
uint8x8_t test_vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmla_u8(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vmla_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
// CHECK:   ret <4 x i16> [[ADD_I]]
uint16x4_t test_vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmla_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vmla_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
// CHECK:   ret <2 x i32> [[ADD_I]]
uint32x2_t test_vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmla_u32(a, b, c);
}

// CHECK-LABEL: define <16 x i8> @test_vmlaq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
// CHECK:   ret <16 x i8> [[ADD_I]]
int8x16_t test_vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
  return vmlaq_s8(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlaq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
  return vmlaq_s16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlaq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
  return vmlaq_s32(a, b, c);
}

// CHECK-LABEL: define <4 x float> @test_vmlaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, %c
// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
// CHECK:   ret <4 x float> [[ADD_I]]
float32x4_t test_vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vmlaq_f32(a, b, c);
}

// CHECK-LABEL: define <16 x i8> @test_vmlaq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
// CHECK:   ret <16 x i8> [[ADD_I]]
uint8x16_t test_vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
  return vmlaq_u8(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlaq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
  return vmlaq_u16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlaq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
  return vmlaq_u32(a, b, c);
}

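// vmlal: widening multiply-accumulate. The narrow operands are multiplied
// with llvm.arm.neon.vmulls/vmullu and the double-width product is added
// to the accumulator a.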
// CHECK-LABEL: define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vmlal_s8(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlal_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlal_s32(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmlal_u8(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlal_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlal_u32(a, b, c);
}

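// vmlal_lane: as vmlal, except the second multiplicand is a single lane
// of c, broadcast by a shufflevector before the widening multiply.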
// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
// CHECK:   ret <4 x i32> [[ADD]]
int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlal_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
// CHECK:   ret <2 x i64> [[ADD]]
int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlal_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
// CHECK:   ret <4 x i32> [[ADD]]
uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlal_lane_u16(a, b, c, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
// CHECK:   ret <2 x i64> [[ADD]]
uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlal_lane_u32(a, b, c, 1);
}

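// vmlal_n: as vmlal, except the scalar c is splatted across a vector by a
// chain of insertelement instructions before the widening multiply.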
// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vmlal_n_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vmlal_n_s32(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 zeroext %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
  return vmlal_n_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
  return vmlal_n_u32(a, b, c);
}

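// vmla_lane / vmlaq_lane: multiply-accumulate against one lane of c,
// broadcast by a shufflevector; the arithmetic itself stays plain mul/add.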
// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
// CHECK:   ret <4 x i16> [[ADD]]
int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vmla_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
// CHECK:   ret <2 x i32> [[ADD]]
int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vmla_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
// CHECK:   ret <4 x i16> [[ADD]]
uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmla_lane_u16(a, b, c, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
// CHECK:   ret <2 x i32> [[ADD]]
uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmla_lane_u32(a, b, c, 1);
}

// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
// CHECK:   ret <2 x float> [[ADD]]
float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vmla_lane_f32(a, b, c, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
// CHECK:   ret <8 x i16> [[ADD]]
int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
  return vmlaq_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
// CHECK:   ret <4 x i32> [[ADD]]
int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
  return vmlaq_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
// CHECK:   ret <8 x i16> [[ADD]]
uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
  return vmlaq_lane_u16(a, b, c, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
// CHECK:   ret <4 x i32> [[ADD]]
uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
  return vmlaq_lane_u32(a, b, c, 1);
}

// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
// CHECK:   ret <4 x float> [[ADD]]
float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
  return vmlaq_lane_f32(a, b, c, 1);
}

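// vmla_n / vmlaq_n: multiply-accumulate against the scalar c, splatted by
// insertelements; again plain mul/add (fmul/fadd for f32).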
9191 // CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 signext %c) #0 {
9192 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
9193 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
9194 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
9195 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
9196 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
9197 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
9198 // CHECK:   ret <4 x i16> [[ADD_I]]
test_vmla_n_s16(int16x4_t a,int16x4_t b,int16_t c)9199 int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
9200   return vmla_n_s16(a, b, c);
9201 }
9202 
9203 // CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
9204 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
9205 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
9206 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
9207 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
9208 // CHECK:   ret <2 x i32> [[ADD_I]]
test_vmla_n_s32(int32x2_t a,int32x2_t b,int32_t c)9209 int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
9210   return vmla_n_s32(a, b, c);
9211 }
9212 
9213 // CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 zeroext %c) #0 {
9214 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
9215 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
9216 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
9217 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
9218 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
9219 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
9220 // CHECK:   ret <4 x i16> [[ADD_I]]
test_vmla_n_u16(uint16x4_t a,uint16x4_t b,uint16_t c)9221 uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
9222   return vmla_n_u16(a, b, c);
9223 }
9224 
9225 // CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
9226 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
9227 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
9228 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
9229 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
9230 // CHECK:   ret <2 x i32> [[ADD_I]]
test_vmla_n_u32(uint32x2_t a,uint32x2_t b,uint32_t c)9231 uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
9232   return vmla_n_u32(a, b, c);
9233 }
9234 
9235 // CHECK-LABEL: define <2 x float> @test_vmla_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
9236 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
9237 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
9238 // CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
9239 // CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
9240 // CHECK:   ret <2 x float> [[ADD_I]]
test_vmla_n_f32(float32x2_t a,float32x2_t b,float32_t c)9241 float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
9242   return vmla_n_f32(a, b, c);
9243 }
9244 
9245 // CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) #0 {
9246 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
9247 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
9248 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
9249 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
9250 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
  return vmlaq_n_s16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
  return vmlaq_n_s32(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
  return vmlaq_n_u16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
  return vmlaq_n_u32(a, b, c);
}

// CHECK-LABEL: define <4 x float> @test_vmlaq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
// CHECK:   ret <4 x float> [[ADD_I]]
float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
  return vmlaq_n_f32(a, b, c);
}


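// vmls/vmlsq: element-wise multiply-subtract, a - (b * c).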
// CHECK-LABEL: define <8 x i8> @test_vmls_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
// CHECK:   ret <8 x i8> [[SUB_I]]
int8x8_t test_vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
  return vmls_s8(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vmls_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
// CHECK:   ret <4 x i16> [[SUB_I]]
int16x4_t test_vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vmls_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
// CHECK:   ret <2 x i32> [[SUB_I]]
int32x2_t test_vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vmls_s32(a, b, c);
}

// CHECK-LABEL: define <2 x float> @test_vmls_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, %c
// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
// CHECK:   ret <2 x float> [[SUB_I]]
float32x2_t test_vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vmls_f32(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vmls_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
// CHECK:   ret <8 x i8> [[SUB_I]]
uint8x8_t test_vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmls_u8(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vmls_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
// CHECK:   ret <4 x i16> [[SUB_I]]
uint16x4_t test_vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmls_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
// CHECK:   ret <2 x i32> [[SUB_I]]
uint32x2_t test_vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmls_u32(a, b, c);
}

// CHECK-LABEL: define <16 x i8> @test_vmlsq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
// CHECK:   ret <16 x i8> [[SUB_I]]
int8x16_t test_vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
  return vmlsq_s8(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
  return vmlsq_s16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
  return vmlsq_s32(a, b, c);
}

// CHECK-LABEL: define <4 x float> @test_vmlsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, %c
// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
// CHECK:   ret <4 x float> [[SUB_I]]
float32x4_t test_vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vmlsq_f32(a, b, c);
}

// CHECK-LABEL: define <16 x i8> @test_vmlsq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
// CHECK:   ret <16 x i8> [[SUB_I]]
uint8x16_t test_vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
  return vmlsq_u8(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
uint16x8_t test_vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
  return vmlsq_u16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
  return vmlsq_u32(a, b, c);
}


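// vmlsl: widening multiply-subtract; b and c are multiplied at twice the
// element width (llvm.arm.neon.vmulls/vmullu) and the product is subtracted from a.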
// CHECK-LABEL: define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vmlsl_s8(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlsl_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlsl_s32(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmlsl_u8(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlsl_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlsl_u32(a, b, c);
}


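// vmlsl_lane: vmlsl against a single lane of c, splatted by the shufflevector.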
// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
// CHECK:   ret <4 x i32> [[SUB]]
int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlsl_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
// CHECK:   ret <2 x i64> [[SUB]]
int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlsl_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
// CHECK:   ret <4 x i32> [[SUB]]
uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlsl_lane_u16(a, b, c, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
// CHECK:   ret <2 x i64> [[SUB]]
uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlsl_lane_u32(a, b, c, 1);
}


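// vmlsl_n: vmlsl against a scalar c broadcast via an insertelement chain.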
// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vmlsl_n_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vmlsl_n_s32(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 zeroext %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
  return vmlsl_n_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
  return vmlsl_n_u32(a, b, c);
}


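// vmls_lane/vmlsq_lane: non-widening multiply-subtract against one lane of c.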
// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
// CHECK:   ret <4 x i16> [[SUB]]
int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vmls_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
// CHECK:   ret <2 x i32> [[SUB]]
int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vmls_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
// CHECK:   ret <4 x i16> [[SUB]]
uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmls_lane_u16(a, b, c, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
// CHECK:   ret <2 x i32> [[SUB]]
uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmls_lane_u32(a, b, c, 1);
}

// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
// CHECK:   ret <2 x float> [[SUB]]
float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vmls_lane_f32(a, b, c, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
// CHECK:   ret <8 x i16> [[SUB]]
int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
  return vmlsq_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
// CHECK:   ret <4 x i32> [[SUB]]
int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
  return vmlsq_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
// CHECK:   ret <8 x i16> [[SUB]]
uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
  return vmlsq_lane_u16(a, b, c, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
// CHECK:   ret <4 x i32> [[SUB]]
uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
  return vmlsq_lane_u32(a, b, c, 1);
}

// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
// CHECK:   ret <4 x float> [[SUB]]
float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
  return vmlsq_lane_f32(a, b, c, 1);
}


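// vmls_n/vmlsq_n: non-widening multiply-subtract against a broadcast scalar.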
// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 signext %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
// CHECK:   ret <4 x i16> [[SUB_I]]
int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
  return vmls_n_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
// CHECK:   ret <2 x i32> [[SUB_I]]
int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
  return vmls_n_s32(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 zeroext %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
// CHECK:   ret <4 x i16> [[SUB_I]]
uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
  return vmls_n_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
// CHECK:   ret <2 x i32> [[SUB_I]]
uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
  return vmls_n_u32(a, b, c);
}

// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
// CHECK:   ret <2 x float> [[SUB_I]]
float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
  return vmls_n_f32(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
  return vmlsq_n_s16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
  return vmlsq_n_s32(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
  return vmlsq_n_u16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
  return vmlsq_n_u32(a, b, c);
}

// CHECK-LABEL: define <4 x float> @test_vmlsq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
// CHECK:   ret <4 x float> [[SUB_I]]
float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
  return vmlsq_n_f32(a, b, c);
}


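// vmovl: lengthen each lane to twice the width (sext for signed, zext for unsigned).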
// CHECK-LABEL: define <8 x i16> @test_vmovl_s8(<8 x i8> %a) #0 {
// CHECK:   [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[VMOVL_I]]
int16x8_t test_vmovl_s8(int8x8_t a) {
  return vmovl_s8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmovl_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VMOVL_I]]
int32x4_t test_vmovl_s16(int16x4_t a) {
  return vmovl_s16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vmovl_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   ret <2 x i64> [[VMOVL_I]]
int64x2_t test_vmovl_s32(int32x2_t a) {
  return vmovl_s32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmovl_u8(<8 x i8> %a) #0 {
// CHECK:   [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[VMOVL_I]]
uint16x8_t test_vmovl_u8(uint8x8_t a) {
  return vmovl_u8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmovl_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VMOVL_I]]
uint32x4_t test_vmovl_u16(uint16x4_t a) {
  return vmovl_u16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vmovl_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   ret <2 x i64> [[VMOVL_I]]
uint64x2_t test_vmovl_u32(uint32x2_t a) {
  return vmovl_u32(a);
}


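// vmovn: narrow each lane to half the width by truncation.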
// CHECK-LABEL: define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VMOVN_I]]
int8x8_t test_vmovn_s16(int16x8_t a) {
  return vmovn_s16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VMOVN_I]]
int16x4_t test_vmovn_s32(int32x4_t a) {
  return vmovn_s32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VMOVN_I]]
int32x2_t test_vmovn_s64(int64x2_t a) {
  return vmovn_s64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vmovn_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VMOVN_I]]
uint8x8_t test_vmovn_u16(uint16x8_t a) {
  return vmovn_u16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vmovn_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VMOVN_I]]
uint16x4_t test_vmovn_u32(uint32x4_t a) {
  return vmovn_u32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vmovn_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VMOVN_I]]
uint32x2_t test_vmovn_u64(uint64x2_t a) {
  return vmovn_u64(a);
}


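// vmov_n/vmovq_n: broadcast a scalar to every lane via an insertelement chain.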
// CHECK-LABEL: define <8 x i8> @test_vmov_n_u8(i8 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VECINIT7_I]]
uint8x8_t test_vmov_n_u8(uint8_t a) {
  return vmov_n_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vmov_n_u16(i16 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VECINIT3_I]]
uint16x4_t test_vmov_n_u16(uint16_t a) {
  return vmov_n_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vmov_n_u32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   ret <2 x i32> [[VECINIT1_I]]
uint32x2_t test_vmov_n_u32(uint32_t a) {
  return vmov_n_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vmov_n_s8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VECINIT7_I]]
int8x8_t test_vmov_n_s8(int8_t a) {
  return vmov_n_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vmov_n_s16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VECINIT3_I]]
int16x4_t test_vmov_n_s16(int16_t a) {
  return vmov_n_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vmov_n_s32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   ret <2 x i32> [[VECINIT1_I]]
int32x2_t test_vmov_n_s32(int32_t a) {
  return vmov_n_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vmov_n_p8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VECINIT7_I]]
poly8x8_t test_vmov_n_p8(poly8_t a) {
  return vmov_n_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vmov_n_p16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VECINIT3_I]]
poly16x4_t test_vmov_n_p16(poly16_t a) {
  return vmov_n_p16(a);
}

// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
// CHECK:   [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
// CHECK:   [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
// CHECK:   [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
// CHECK:   [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
// CHECK:   ret <4 x half> [[VECINIT3]]
float16x4_t test_vmov_n_f16(float16_t *a) {
  return vmov_n_f16(*a);
}

// CHECK-LABEL: define <2 x float> @test_vmov_n_f32(float %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
// CHECK:   ret <2 x float> [[VECINIT1_I]]
float32x2_t test_vmov_n_f32(float32_t a) {
  return vmov_n_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vmovq_n_u8(i8 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VECINIT15_I]]
uint8x16_t test_vmovq_n_u8(uint8_t a) {
  return vmovq_n_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmovq_n_u16(i16 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VECINIT7_I]]
uint16x8_t test_vmovq_n_u16(uint16_t a) {
  return vmovq_n_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmovq_n_u32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
// CHECK:   ret <4 x i32> [[VECINIT3_I]]
uint32x4_t test_vmovq_n_u32(uint32_t a) {
  return vmovq_n_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vmovq_n_s8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VECINIT15_I]]
int8x16_t test_vmovq_n_s8(int8_t a) {
  return vmovq_n_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmovq_n_s16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VECINIT7_I]]
int16x8_t test_vmovq_n_s16(int16_t a) {
  return vmovq_n_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmovq_n_s32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
// CHECK:   ret <4 x i32> [[VECINIT3_I]]
int32x4_t test_vmovq_n_s32(int32_t a) {
  return vmovq_n_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vmovq_n_p8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VECINIT15_I]]
poly8x16_t test_vmovq_n_p8(poly8_t a) {
  return vmovq_n_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmovq_n_p16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VECINIT7_I]]
poly16x8_t test_vmovq_n_p16(poly16_t a) {
  return vmovq_n_p16(a);
}

// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
// CHECK:   [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
// CHECK:   [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
// CHECK:   [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
// CHECK:   [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
// CHECK:   [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
// CHECK:   [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
// CHECK:   [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
// CHECK:   [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
// CHECK:   ret <8 x half> [[VECINIT7]]
float16x8_t test_vmovq_n_f16(float16_t *a) {
  return vmovq_n_f16(*a);
}

// CHECK-LABEL: define <4 x float> @test_vmovq_n_f32(float %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
// CHECK:   ret <4 x float> [[VECINIT3_I]]
float32x4_t test_vmovq_n_f32(float32_t a) {
  return vmovq_n_f32(a);
}

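// The <1 x i64> vmov_n tests add the result to itself, so the splat feeds a
// real use in the checked IR.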
10186 // CHECK-LABEL: define <1 x i64> @test_vmov_n_s64(i64 %a) #0 {
10187 // CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
10188 // CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
10189 // CHECK:   ret <1 x i64> [[ADD_I]]
test_vmov_n_s64(int64_t a)10190 int64x1_t test_vmov_n_s64(int64_t a) {
10191   int64x1_t tmp = vmov_n_s64(a);
10192   return vadd_s64(tmp, tmp);
10193 }
10194 
10195 // CHECK-LABEL: define <1 x i64> @test_vmov_n_u64(i64 %a) #0 {
10196 // CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
10197 // CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
10198 // CHECK:   ret <1 x i64> [[ADD_I]]
test_vmov_n_u64(uint64_t a)10199 uint64x1_t test_vmov_n_u64(uint64_t a) {
10200   uint64x1_t tmp = vmov_n_u64(a);
10201   return vadd_u64(tmp, tmp);
10202 }
10203 
10204 // CHECK-LABEL: define <2 x i64> @test_vmovq_n_s64(i64 %a) #0 {
10205 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
10206 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
10207 // CHECK:   ret <2 x i64> [[VECINIT1_I]]
test_vmovq_n_s64(int64_t a)10208 int64x2_t test_vmovq_n_s64(int64_t a) {
10209   return vmovq_n_s64(a);
10210 }
10211 
10212 // CHECK-LABEL: define <2 x i64> @test_vmovq_n_u64(i64 %a) #0 {
10213 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
10214 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
10215 // CHECK:   ret <2 x i64> [[VECINIT1_I]]
test_vmovq_n_u64(uint64_t a)10216 uint64x2_t test_vmovq_n_u64(uint64_t a) {
10217   return vmovq_n_u64(a);
10218 }
10219 
10220 
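// vmul/vmulq: element-wise multiply. No NEON intrinsic is needed here; the
// checks below expect a plain IR mul (integer) or fmul (float) on the vector
// arguments.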
// CHECK-LABEL: define <8 x i8> @test_vmul_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[MUL_I]]
int8x8_t test_vmul_s8(int8x8_t a, int8x8_t b) {
  return vmul_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[MUL_I]]
int16x4_t test_vmul_s16(int16x4_t a, int16x4_t b) {
  return vmul_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[MUL_I]]
int32x2_t test_vmul_s32(int32x2_t a, int32x2_t b) {
  return vmul_s32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vmul_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, %b
// CHECK:   ret <2 x float> [[MUL_I]]
float32x2_t test_vmul_f32(float32x2_t a, float32x2_t b) {
  return vmul_f32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vmul_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[MUL_I]]
uint8x8_t test_vmul_u8(uint8x8_t a, uint8x8_t b) {
  return vmul_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[MUL_I]]
uint16x4_t test_vmul_u16(uint16x4_t a, uint16x4_t b) {
  return vmul_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[MUL_I]]
uint32x2_t test_vmul_u32(uint32x2_t a, uint32x2_t b) {
  return vmul_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vmulq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[MUL_I]]
int8x16_t test_vmulq_s8(int8x16_t a, int8x16_t b) {
  return vmulq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[MUL_I]]
int16x8_t test_vmulq_s16(int16x8_t a, int16x8_t b) {
  return vmulq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[MUL_I]]
int32x4_t test_vmulq_s32(int32x4_t a, int32x4_t b) {
  return vmulq_s32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, %b
// CHECK:   ret <4 x float> [[MUL_I]]
float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b) {
  return vmulq_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vmulq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[MUL_I]]
uint8x16_t test_vmulq_u8(uint8x16_t a, uint8x16_t b) {
  return vmulq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[MUL_I]]
uint16x8_t test_vmulq_u16(uint16x8_t a, uint16x8_t b) {
  return vmulq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[MUL_I]]
uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b) {
  return vmulq_u32(a, b);
}


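// vmull: widening multiply, whose result lanes are twice the width of the
// source lanes. The checks expect @llvm.arm.neon.vmulls (signed),
// @llvm.arm.neon.vmullu (unsigned), or @llvm.arm.neon.vmullp (polynomial),
// with the usual <8 x i8> bitcast round-trips around the 16- and 32-bit
// element variants.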
// CHECK-LABEL: define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VMULL_I]]
int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
  return vmull_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL2_I]]
int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {
  return vmull_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL2_I]]
int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) {
  return vmull_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VMULL_I]]
uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {
  return vmull_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL2_I]]
uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {
  return vmull_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL2_I]]
uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {
  return vmull_u32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VMULL_I]]
poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {
  return vmull_p8(a, b);
}


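// vmull_lane: the selected lane of the second operand is splatted with a
// shufflevector, then fed to the same vmulls/vmullu intrinsics as plain
// vmull.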
// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL2_I]]
int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) {
  return vmull_lane_s16(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL2_I]]
int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) {
  return vmull_lane_s32(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL2_I]]
uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) {
  return vmull_lane_u16(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL2_I]]
uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) {
  return vmull_lane_u32(a, b, 1);
}


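// vmull_n: the scalar operand is broadcast with an insertelement chain
// rather than a shuffle, then multiplied with the same widening intrinsics.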
// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 signext %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL5_I]]
int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
  return vmull_n_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL3_I]]
int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
  return vmull_n_s32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 zeroext %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL5_I]]
uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
  return vmull_n_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL3_I]]
uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
  return vmull_n_u32(a, b);
}


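// vmul_p8/vmulq_p8: polynomial (carry-less) multiply has no plain IR
// equivalent, so the checks expect the @llvm.arm.neon.vmulp intrinsic.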
// CHECK-LABEL: define <8 x i8> @test_vmul_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VMUL_V_I]]
poly8x8_t test_vmul_p8(poly8x8_t a, poly8x8_t b) {
  return vmul_p8(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vmulq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VMULQ_V_I]]
poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) {
  return vmulq_p8(a, b);
}


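// vmul_lane/vmulq_lane: splat the selected lane with a shufflevector, then
// multiply with an ordinary mul/fmul.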
// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
// CHECK:   ret <4 x i16> [[MUL]]
int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) {
  return vmul_lane_s16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
// CHECK:   ret <2 x i32> [[MUL]]
int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) {
  return vmul_lane_s32(a, b, 1);
}

// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
// CHECK:   ret <2 x float> [[MUL]]
float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) {
  return vmul_lane_f32(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
// CHECK:   ret <4 x i16> [[MUL]]
uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) {
  return vmul_lane_u16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
// CHECK:   ret <2 x i32> [[MUL]]
uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) {
  return vmul_lane_u32(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
// CHECK:   ret <8 x i16> [[MUL]]
int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) {
  return vmulq_lane_s16(a, b, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
// CHECK:   ret <4 x i32> [[MUL]]
int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) {
  return vmulq_lane_s32(a, b, 1);
}

// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
// CHECK:   ret <4 x float> [[MUL]]
float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) {
  return vmulq_lane_f32(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
// CHECK:   ret <8 x i16> [[MUL]]
uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) {
  return vmulq_lane_u16(a, b, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
// CHECK:   ret <4 x i32> [[MUL]]
uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) {
  return vmulq_lane_u32(a, b, 1);
}


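// vmul_n/vmulq_n: broadcast the scalar with an insertelement chain, then
// multiply element-wise.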
// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 signext %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
// CHECK:   ret <4 x i16> [[MUL_I]]
int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
  return vmul_n_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
// CHECK:   ret <2 x i32> [[MUL_I]]
int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
  return vmul_n_s32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
// CHECK:   ret <2 x float> [[MUL_I]]
float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
  return vmul_n_f32(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 zeroext %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
// CHECK:   ret <4 x i16> [[MUL_I]]
uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
  return vmul_n_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
// CHECK:   ret <2 x i32> [[MUL_I]]
uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
  return vmul_n_u32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 signext %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
// CHECK:   ret <8 x i16> [[MUL_I]]
int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
  return vmulq_n_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> %a, i32 %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
// CHECK:   ret <4 x i32> [[MUL_I]]
int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
  return vmulq_n_s32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
// CHECK:   ret <4 x float> [[MUL_I]]
float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
  return vmulq_n_f32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 zeroext %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
// CHECK:   ret <8 x i16> [[MUL_I]]
uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
  return vmulq_n_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
// CHECK:   ret <4 x i32> [[MUL_I]]
uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
  return vmulq_n_u32(a, b);
}


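// vmvn/vmvnq: bitwise NOT, expected to lower to an xor with an all-ones
// vector.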
// CHECK-LABEL: define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <8 x i8> [[NEG_I]]
int8x8_t test_vmvn_s8(int8x8_t a) {
  return vmvn_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   ret <4 x i16> [[NEG_I]]
int16x4_t test_vmvn_s16(int16x4_t a) {
  return vmvn_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
// CHECK:   ret <2 x i32> [[NEG_I]]
int32x2_t test_vmvn_s32(int32x2_t a) {
  return vmvn_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vmvn_u8(<8 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <8 x i8> [[NEG_I]]
uint8x8_t test_vmvn_u8(uint8x8_t a) {
  return vmvn_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vmvn_u16(<4 x i16> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   ret <4 x i16> [[NEG_I]]
uint16x4_t test_vmvn_u16(uint16x4_t a) {
  return vmvn_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vmvn_u32(<2 x i32> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
// CHECK:   ret <2 x i32> [[NEG_I]]
uint32x2_t test_vmvn_u32(uint32x2_t a) {
  return vmvn_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vmvn_p8(<8 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <8 x i8> [[NEG_I]]
poly8x8_t test_vmvn_p8(poly8x8_t a) {
  return vmvn_p8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <16 x i8> [[NEG_I]]
int8x16_t test_vmvnq_s8(int8x16_t a) {
  return vmvnq_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   ret <8 x i16> [[NEG_I]]
int16x8_t test_vmvnq_s16(int16x8_t a) {
  return vmvnq_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK:   ret <4 x i32> [[NEG_I]]
int32x4_t test_vmvnq_s32(int32x4_t a) {
  return vmvnq_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vmvnq_u8(<16 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <16 x i8> [[NEG_I]]
uint8x16_t test_vmvnq_u8(uint8x16_t a) {
  return vmvnq_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmvnq_u16(<8 x i16> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   ret <8 x i16> [[NEG_I]]
uint16x8_t test_vmvnq_u16(uint16x8_t a) {
  return vmvnq_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmvnq_u32(<4 x i32> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK:   ret <4 x i32> [[NEG_I]]
uint32x4_t test_vmvnq_u32(uint32x4_t a) {
  return vmvnq_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vmvnq_p8(<16 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <16 x i8> [[NEG_I]]
poly8x16_t test_vmvnq_p8(poly8x16_t a) {
  return vmvnq_p8(a);
}


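// vneg/vnegq: negation, expected as a sub from zero for integers and as an
// fsub from -0.0 (the canonical IR negation idiom) for floats.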
// CHECK-LABEL: define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
// CHECK:   ret <8 x i8> [[SUB_I]]
int8x8_t test_vneg_s8(int8x8_t a) {
  return vneg_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
// CHECK:   ret <4 x i16> [[SUB_I]]
int16x4_t test_vneg_s16(int16x4_t a) {
  return vneg_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
// CHECK:   ret <2 x i32> [[SUB_I]]
int32x2_t test_vneg_s32(int32x2_t a) {
  return vneg_s32(a);
}

// CHECK-LABEL: define <2 x float> @test_vneg_f32(<2 x float> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
// CHECK:   ret <2 x float> [[SUB_I]]
float32x2_t test_vneg_f32(float32x2_t a) {
  return vneg_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
// CHECK:   ret <16 x i8> [[SUB_I]]
int8x16_t test_vnegq_s8(int8x16_t a) {
  return vnegq_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vnegq_s16(int16x8_t a) {
  return vnegq_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vnegq_s32(int32x4_t a) {
  return vnegq_s32(a);
}

// CHECK-LABEL: define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
// CHECK:   ret <4 x float> [[SUB_I]]
float32x4_t test_vnegq_f32(float32x4_t a) {
  return vnegq_f32(a);
}


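// vorn/vornq: OR-NOT; the second operand is inverted with an all-ones xor
// and then ORed into the first.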
// CHECK-LABEL: define <8 x i8> @test_vorn_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
// CHECK:   ret <8 x i8> [[OR_I]]
int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) {
  return vorn_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vorn_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
// CHECK:   ret <4 x i16> [[OR_I]]
int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) {
  return vorn_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vorn_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
// CHECK:   ret <2 x i32> [[OR_I]]
int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) {
  return vorn_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vorn_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
// CHECK:   ret <1 x i64> [[OR_I]]
int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) {
  return vorn_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vorn_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
// CHECK:   ret <8 x i8> [[OR_I]]
uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) {
  return vorn_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vorn_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
// CHECK:   ret <4 x i16> [[OR_I]]
uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) {
  return vorn_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vorn_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
// CHECK:   ret <2 x i32> [[OR_I]]
uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) {
  return vorn_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vorn_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
// CHECK:   ret <1 x i64> [[OR_I]]
uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) {
  return vorn_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vornq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
// CHECK:   ret <16 x i8> [[OR_I]]
int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) {
  return vornq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
// CHECK:   ret <8 x i16> [[OR_I]]
int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) {
  return vornq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vornq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
// CHECK:   ret <4 x i32> [[OR_I]]
int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) {
  return vornq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vornq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
// CHECK:   ret <2 x i64> [[OR_I]]
int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) {
  return vornq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vornq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
// CHECK:   ret <16 x i8> [[OR_I]]
uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) {
  return vornq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vornq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
// CHECK:   ret <8 x i16> [[OR_I]]
uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) {
  return vornq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
// CHECK:   ret <4 x i32> [[OR_I]]
uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) {
  return vornq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vornq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
// CHECK:   ret <2 x i64> [[OR_I]]
uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) {
  return vornq_u64(a, b);
}


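// vorr/vorrq: plain bitwise OR, expected as a single IR or instruction.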
// CHECK-LABEL: define <8 x i8> @test_vorr_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[OR_I]]
int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) {
  return vorr_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vorr_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[OR_I]]
int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) {
  return vorr_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vorr_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[OR_I]]
int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) {
  return vorr_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vorr_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[OR_I]]
int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) {
  return vorr_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vorr_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[OR_I]]
uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) {
  return vorr_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vorr_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[OR_I]]
uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) {
  return vorr_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vorr_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[OR_I]]
uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) {
  return vorr_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vorr_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[OR_I]]
uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) {
  return vorr_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vorrq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[OR_I]]
int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) {
  return vorrq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[OR_I]]
int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) {
  return vorrq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vorrq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[OR_I]]
int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) {
  return vorrq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vorrq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[OR_I]]
int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) {
  return vorrq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vorrq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[OR_I]]
uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) {
  return vorrq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vorrq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[OR_I]]
uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) {
  return vorrq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[OR_I]]
uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) {
  return vorrq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vorrq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[OR_I]]
uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) {
  return vorrq_u64(a, b);
}


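// vpadal/vpadalq: pairwise add and accumulate long; adjacent lanes of the
// narrow operand are summed and added into the wider accumulator via
// @llvm.arm.neon.vpadals/vpadalu.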
// CHECK-LABEL: define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> [[VPADAL_V_I]], <8 x i8> %b) #4
// CHECK:   ret <4 x i16> [[VPADAL_V1_I]]
int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
  return vpadal_s8(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> [[VPADAL_V_I]], <4 x i16> [[VPADAL_V1_I]]) #4
// CHECK:   ret <2 x i32> [[VPADAL_V2_I]]
int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
  return vpadal_s16(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> [[VPADAL_V_I]], <2 x i32> [[VPADAL_V1_I]]) #4
// CHECK:   ret <1 x i64> [[VPADAL_V2_I]]
int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
  return vpadal_s32(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> [[VPADAL_V_I]], <8 x i8> %b) #4
// CHECK:   ret <4 x i16> [[VPADAL_V1_I]]
uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
  return vpadal_u8(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> [[VPADAL_V_I]], <4 x i16> [[VPADAL_V1_I]]) #4
// CHECK:   ret <2 x i32> [[VPADAL_V2_I]]
uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
  return vpadal_u16(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> [[VPADAL_V_I]], <2 x i32> [[VPADAL_V1_I]]) #4
// CHECK:   ret <1 x i64> [[VPADAL_V2_I]]
uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
  return vpadal_u32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> [[VPADALQ_V_I]], <16 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VPADALQ_V1_I]]
int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
  return vpadalq_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> [[VPADALQ_V_I]], <8 x i16> [[VPADALQ_V1_I]]) #4
// CHECK:   ret <4 x i32> [[VPADALQ_V2_I]]
int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
  return vpadalq_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> [[VPADALQ_V_I]], <4 x i32> [[VPADALQ_V1_I]]) #4
// CHECK:   ret <2 x i64> [[VPADALQ_V2_I]]
int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
  return vpadalq_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> [[VPADALQ_V_I]], <16 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VPADALQ_V1_I]]
uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
  return vpadalq_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> [[VPADALQ_V_I]], <8 x i16> [[VPADALQ_V1_I]]) #4
// CHECK:   ret <4 x i32> [[VPADALQ_V2_I]]
uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
  return vpadalq_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> [[VPADALQ_V_I]], <4 x i32> [[VPADALQ_V1_I]]) #4
// CHECK:   ret <2 x i64> [[VPADALQ_V2_I]]
uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
  return vpadalq_u32(a, b);
}


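// vpadd: pairwise add; adjacent lane pairs of the concatenated operands are
// summed, via @llvm.arm.neon.vpadd.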
// CHECK-LABEL: define <8 x i8> @test_vpadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VPADD_V_I]]
int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
  return vpadd_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vpadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4
// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
  return vpadd_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4
// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
  return vpadd_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vpadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VPADD_V_I]]
uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
  return vpadd_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vpadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4
// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
  return vpadd_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4
// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
  return vpadd_u32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vpadd_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> [[VPADD_V_I]], <2 x float> [[VPADD_V1_I]]) #4
// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
  return vpadd_f32(a, b);
}

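// vpaddl: pairwise add long. Adjacent pairs within a single vector are summed
// into lanes twice as wide, halving the lane count.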
// CHECK-LABEL: define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 {
// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a) #4
// CHECK:   ret <4 x i16> [[VPADDL_I]]
int16x4_t test_vpaddl_s8(int8x8_t a) {
  return vpaddl_s8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #4
// CHECK:   ret <2 x i32> [[VPADDL1_I]]
int32x2_t test_vpaddl_s16(int16x4_t a) {
  return vpaddl_s16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #4
// CHECK:   ret <1 x i64> [[VPADDL1_I]]
int64x1_t test_vpaddl_s32(int32x2_t a) {
  return vpaddl_s32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 {
// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a) #4
// CHECK:   ret <4 x i16> [[VPADDL_I]]
uint16x4_t test_vpaddl_u8(uint8x8_t a) {
  return vpaddl_u8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #4
// CHECK:   ret <2 x i32> [[VPADDL1_I]]
uint32x2_t test_vpaddl_u16(uint16x4_t a) {
  return vpaddl_u16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #4
// CHECK:   ret <1 x i64> [[VPADDL1_I]]
uint64x1_t test_vpaddl_u32(uint32x2_t a) {
  return vpaddl_u32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 {
// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a) #4
// CHECK:   ret <8 x i16> [[VPADDL_I]]
int16x8_t test_vpaddlq_s8(int8x16_t a) {
  return vpaddlq_s8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #4
// CHECK:   ret <4 x i32> [[VPADDL1_I]]
int32x4_t test_vpaddlq_s16(int16x8_t a) {
  return vpaddlq_s16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #4
// CHECK:   ret <2 x i64> [[VPADDL1_I]]
int64x2_t test_vpaddlq_s32(int32x4_t a) {
  return vpaddlq_s32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 {
// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a) #4
// CHECK:   ret <8 x i16> [[VPADDL_I]]
uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
  return vpaddlq_u8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #4
// CHECK:   ret <4 x i32> [[VPADDL1_I]]
uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
  return vpaddlq_u16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #4
// CHECK:   ret <2 x i64> [[VPADDL1_I]]
uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
  return vpaddlq_u32(a);
}

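// vpmax: pairwise maximum. Each result lane is the larger element of an
// adjacent pair, low half from a and high half from b; signed variants lower
// to vpmaxs, unsigned to vpmaxu.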
// CHECK-LABEL: define <8 x i8> @test_vpmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VPMAX_V_I]]
int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) {
  return vpmax_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vpmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> [[VPMAX_V_I]], <4 x i16> [[VPMAX_V1_I]]) #4
// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
  return vpmax_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> [[VPMAX_V_I]], <2 x i32> [[VPMAX_V1_I]]) #4
// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) {
  return vpmax_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vpmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VPMAX_V_I]]
uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) {
  return vpmax_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vpmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> [[VPMAX_V_I]], <4 x i16> [[VPMAX_V1_I]]) #4
// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
  return vpmax_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpmax_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> [[VPMAX_V_I]], <2 x i32> [[VPMAX_V1_I]]) #4
// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
  return vpmax_u32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vpmax_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VPMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> [[VPMAX_V_I]], <2 x float> [[VPMAX_V1_I]]) #4
// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <2 x float> [[VPMAX_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) {
  return vpmax_f32(a, b);
}

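// vpmin: pairwise minimum, the mirror image of vpmax; signed variants lower
// to vpmins, unsigned to vpminu.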
// CHECK-LABEL: define <8 x i8> @test_vpmin_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VPMIN_V_I]]
int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) {
  return vpmin_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vpmin_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> [[VPMIN_V_I]], <4 x i16> [[VPMIN_V1_I]]) #4
// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
  return vpmin_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpmin_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> [[VPMIN_V_I]], <2 x i32> [[VPMIN_V1_I]]) #4
// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) {
  return vpmin_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vpmin_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VPMIN_V_I]]
uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) {
  return vpmin_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vpmin_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> [[VPMIN_V_I]], <4 x i16> [[VPMIN_V1_I]]) #4
// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
  return vpmin_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpmin_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> [[VPMIN_V_I]], <2 x i32> [[VPMIN_V1_I]]) #4
// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
  return vpmin_u32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vpmin_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VPMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> [[VPMIN_V_I]], <2 x float> [[VPMIN_V1_I]]) #4
// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <2 x float> [[VPMIN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) {
  return vpmin_f32(a, b);
}

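// vqabs: saturating absolute value. The most negative value clamps to the
// most positive value instead of wrapping.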
// CHECK-LABEL: define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 {
// CHECK:   [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a) #4
// CHECK:   ret <8 x i8> [[VQABS_V_I]]
int8x8_t test_vqabs_s8(int8x8_t a) {
  return vqabs_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> [[VQABS_V_I]]) #4
// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP1]]
int16x4_t test_vqabs_s16(int16x4_t a) {
  return vqabs_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> [[VQABS_V_I]]) #4
// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP1]]
int32x2_t test_vqabs_s32(int32x2_t a) {
  return vqabs_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 {
// CHECK:   [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a) #4
// CHECK:   ret <16 x i8> [[VQABSQ_V_I]]
int8x16_t test_vqabsq_s8(int8x16_t a) {
  return vqabsq_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> [[VQABSQ_V_I]]) #4
// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP1]]
int16x8_t test_vqabsq_s16(int16x8_t a) {
  return vqabsq_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> [[VQABSQ_V_I]]) #4
// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP1]]
int32x4_t test_vqabsq_s32(int32x4_t a) {
  return vqabsq_s32(a);
}

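// vqadd: saturating add. Sums that would overflow clamp to the type's range;
// signed variants lower to vqadds, unsigned to vqaddu.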
// CHECK-LABEL: define <8 x i8> @test_vqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQADD_V_I]]
int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
  return vqadd_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
  return vqadd_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
  return vqadd_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
  return vqadd_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQADD_V_I]]
uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
  return vqadd_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
  return vqadd_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
  return vqadd_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
  return vqadd_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
  return vqaddq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
  return vqaddq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
  return vqaddq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
  return vqaddq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
  return vqaddq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vqaddq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
  return vqaddq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
  return vqaddq_u64(a, b);
}

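// vqdmlal: saturating doubling multiply-accumulate long. The widened product
// 2*b*c (via vqdmull) is added to the accumulator a with saturation (vqadds).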
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlal_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlal_s32(a, b, c);
}

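// The _lane variants broadcast one lane of c with a shufflevector before the
// same vqdmull + vqadds sequence.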
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlal_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlal_lane_s32(a, b, c, 1);
}

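// The _n variants splat the scalar c across a vector with a chain of
// insertelements before the multiply-accumulate.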
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #4
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #4
// CHECK:   ret <4 x i32> [[VQDMLAL_V6_I]]
int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vqdmlal_n_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #4
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #4
// CHECK:   ret <2 x i64> [[VQDMLAL_V4_I]]
int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vqdmlal_n_s32(a, b, c);
}

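// vqdmlsl: saturating doubling multiply-subtract long. Identical to vqdmlal
// except the widened product is subtracted via vqsubs.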
// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlsl_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlsl_s32(a, b, c);
}

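// As with vqdmlal, the _lane variants broadcast one lane of c before the
// vqdmull + vqsubs sequence.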
// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlsl_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlsl_lane_s32(a, b, c, 1);
}

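// The _n variants splat the scalar c before the multiply-subtract.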
// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #4
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #4
// CHECK:   ret <4 x i32> [[VQDMLSL_V6_I]]
int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vqdmlsl_n_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #4
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #4
// CHECK:   ret <2 x i64> [[VQDMLSL_V4_I]]
int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vqdmlsl_n_s32(a, b, c);
}

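// vqdmulh: saturating doubling multiply returning the high half of 2*a*b,
// i.e. a fixed-point multiply for Q15/Q31 data.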
// CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
  return vqdmulh_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
  return vqdmulh_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
  return vqdmulhq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
  return vqdmulhq_s32(a, b);
}

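// The _lane variants broadcast one lane of b before the vqdmulh call.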
12052 // CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
12053 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
12054 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12055 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
12056 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12057 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12058 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
12059 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
12060 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
12061 // CHECK:   ret <4 x i16> [[TMP2]]
test_vqdmulh_lane_s16(int16x4_t a,int16x4_t b)12062 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
12063   return vqdmulh_lane_s16(a, b, 3);
12064 }
12065 
12066 // CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
12067 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
12068 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12069 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
12070 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12071 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12072 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
12073 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
12074 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
12075 // CHECK:   ret <2 x i32> [[TMP2]]
test_vqdmulh_lane_s32(int32x2_t a,int32x2_t b)12076 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
12077   return vqdmulh_lane_s32(a, b, 1);
12078 }
12079 
12080 // CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
12081 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
12082 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12083 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
12084 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12085 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12086 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
12087 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
12088 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
12089 // CHECK:   ret <8 x i16> [[TMP2]]
test_vqdmulhq_lane_s16(int16x8_t a,int16x4_t b)12090 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
12091   return vqdmulhq_lane_s16(a, b, 3);
12092 }
12093 
12094 // CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
12095 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
12096 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12097 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
12098 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12099 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12100 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
12101 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
12102 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
12103 // CHECK:   ret <4 x i32> [[TMP2]]
12104 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
12105   return vqdmulhq_lane_s32(a, b, 1);
12106 }
12107 
12108 
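// vqdmulh_n_*: saturating doubling multiply high, with the scalar operand splatted into every lane via insertelement before the multiply.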
12109 // CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 signext %b) #0 {
12110 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12111 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
12112 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
12113 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
12114 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
12115 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
12116 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12117 // CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12118 // CHECK:   [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #4
12119 // CHECK:   [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
12120 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16>
12121 // CHECK:   ret <4 x i16> [[TMP2]]
12122 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
12123   return vqdmulh_n_s16(a, b);
12124 }
12125 
12126 // CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
12127 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12128 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
12129 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
12130 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
12131 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12132 // CHECK:   [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12133 // CHECK:   [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #4
12134 // CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
12135 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32>
12136 // CHECK:   ret <2 x i32> [[TMP2]]
12137 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
12138   return vqdmulh_n_s32(a, b);
12139 }
12140 
12141 // CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 signext %b) #0 {
12142 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12143 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
12144 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
12145 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
12146 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
12147 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
12148 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
12149 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
12150 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
12151 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
12152 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12153 // CHECK:   [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12154 // CHECK:   [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #4
12155 // CHECK:   [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
12156 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16>
12157 // CHECK:   ret <8 x i16> [[TMP2]]
12158 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
12159   return vqdmulhq_n_s16(a, b);
12160 }
12161 
12162 // CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
12163 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12164 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
12165 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
12166 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
12167 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
12168 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
12169 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12170 // CHECK:   [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12171 // CHECK:   [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #4
12172 // CHECK:   [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
12173 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32>
12174 // CHECK:   ret <4 x i32> [[TMP2]]
12175 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
12176   return vqdmulhq_n_s32(a, b);
12177 }
12178 
12179 
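// vqdmull_*: saturating doubling multiply long; the product elements are twice the width of the inputs.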
12180 // CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
12181 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12182 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
12183 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12184 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12185 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
12186 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
12187 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
12188 // CHECK:   ret <4 x i32> [[TMP2]]
12189 int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
12190   return vqdmull_s16(a, b);
12191 }
12192 
12193 // CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
12194 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12195 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
12196 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12197 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12198 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
12199 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
12200 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
12201 // CHECK:   ret <2 x i64> [[TMP2]]
12202 int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
12203   return vqdmull_s32(a, b);
12204 }
12205 
12206 
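// vqdmull_lane_*: vqdmull with the second operand broadcast from one lane by shufflevector.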
12207 // CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
12208 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
12209 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12210 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
12211 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12212 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12213 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
12214 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
12215 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
12216 // CHECK:   ret <4 x i32> [[TMP2]]
12217 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) {
12218   return vqdmull_lane_s16(a, b, 3);
12219 }
12220 
12221 // CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
12222 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
12223 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12224 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
12225 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12226 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12227 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
12228 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
12229 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
12230 // CHECK:   ret <2 x i64> [[TMP2]]
12231 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) {
12232   return vqdmull_lane_s32(a, b, 1);
12233 }
12234 
12235 
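// vqdmull_n_*: vqdmull with the second operand splatted from a scalar.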
12236 // CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 signext %b) #0 {
12237 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12238 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
12239 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
12240 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
12241 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
12242 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
12243 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12244 // CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12245 // CHECK:   [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #4
12246 // CHECK:   [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
12247 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32>
12248 // CHECK:   ret <4 x i32> [[TMP2]]
12249 int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
12250   return vqdmull_n_s16(a, b);
12251 }
12252 
12253 // CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 {
12254 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12255 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
12256 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
12257 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
12258 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12259 // CHECK:   [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12260 // CHECK:   [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #4
12261 // CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
12262 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64>
12263 // CHECK:   ret <2 x i64> [[TMP2]]
12264 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
12265   return vqdmull_n_s32(a, b);
12266 }
12267 
12268 
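// vqmovn_*: saturating narrow to half-width elements, with signed (vqmovns) and unsigned (vqmovnu) saturation.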
12269 // CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 {
12270 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12271 // CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12272 // CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> [[VQMOVN_V_I]]) #4
12273 // CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
12274 int8x8_t test_vqmovn_s16(int16x8_t a) {
12275   return vqmovn_s16(a);
12276 }
12277 
12278 // CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 {
12279 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12280 // CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12281 // CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> [[VQMOVN_V_I]]) #4
12282 // CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
12283 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
12284 // CHECK:   ret <4 x i16> [[TMP1]]
12285 int16x4_t test_vqmovn_s32(int32x4_t a) {
12286   return vqmovn_s32(a);
12287 }
12288 
12289 // CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 {
12290 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12291 // CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
12292 // CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> [[VQMOVN_V_I]]) #4
12293 // CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
12294 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
12295 // CHECK:   ret <2 x i32> [[TMP1]]
12296 int32x2_t test_vqmovn_s64(int64x2_t a) {
12297   return vqmovn_s64(a);
12298 }
12299 
12300 // CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 {
12301 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12302 // CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12303 // CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> [[VQMOVN_V_I]]) #4
12304 // CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
12305 uint8x8_t test_vqmovn_u16(uint16x8_t a) {
12306   return vqmovn_u16(a);
12307 }
12308 
12309 // CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 {
12310 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12311 // CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12312 // CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> [[VQMOVN_V_I]]) #4
12313 // CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
12314 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
12315 // CHECK:   ret <4 x i16> [[TMP1]]
12316 uint16x4_t test_vqmovn_u32(uint32x4_t a) {
12317   return vqmovn_u32(a);
12318 }
12319 
12320 // CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 {
12321 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12322 // CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
12323 // CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> [[VQMOVN_V_I]]) #4
12324 // CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
12325 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
12326 // CHECK:   ret <2 x i32> [[TMP1]]
12327 uint32x2_t test_vqmovn_u64(uint64x2_t a) {
12328   return vqmovn_u64(a);
12329 }
12330 
12331 
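// vqmovun_*: saturating narrow from signed input to an unsigned result (vqmovnsu).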
12332 // CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 {
12333 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12334 // CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12335 // CHECK:   [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> [[VQMOVUN_V_I]]) #4
12336 // CHECK:   ret <8 x i8> [[VQMOVUN_V1_I]]
12337 uint8x8_t test_vqmovun_s16(int16x8_t a) {
12338   return vqmovun_s16(a);
12339 }
12340 
12341 // CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 {
12342 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12343 // CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12344 // CHECK:   [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> [[VQMOVUN_V_I]]) #4
12345 // CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
12346 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <4 x i16>
12347 // CHECK:   ret <4 x i16> [[TMP1]]
12348 uint16x4_t test_vqmovun_s32(int32x4_t a) {
12349   return vqmovun_s32(a);
12350 }
12351 
12352 // CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 {
12353 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12354 // CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
12355 // CHECK:   [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> [[VQMOVUN_V_I]]) #4
12356 // CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
12357 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <2 x i32>
12358 // CHECK:   ret <2 x i32> [[TMP1]]
12359 uint32x2_t test_vqmovun_s64(int64x2_t a) {
12360   return vqmovun_s64(a);
12361 }
12362 
12363 
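// vqneg(q)_*: saturating negate; the minimum signed value saturates to the maximum instead of wrapping.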
12364 // CHECK-LABEL: define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
12365 // CHECK:   [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a) #4
12366 // CHECK:   ret <8 x i8> [[VQNEG_V_I]]
12367 int8x8_t test_vqneg_s8(int8x8_t a) {
12368   return vqneg_s8(a);
12369 }
12370 
12371 // CHECK-LABEL: define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
12372 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12373 // CHECK:   [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12374 // CHECK:   [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> [[VQNEG_V_I]]) #4
12375 // CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
12376 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <4 x i16>
12377 // CHECK:   ret <4 x i16> [[TMP1]]
12378 int16x4_t test_vqneg_s16(int16x4_t a) {
12379   return vqneg_s16(a);
12380 }
12381 
12382 // CHECK-LABEL: define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
12383 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12384 // CHECK:   [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12385 // CHECK:   [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> [[VQNEG_V_I]]) #4
12386 // CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
12387 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <2 x i32>
12388 // CHECK:   ret <2 x i32> [[TMP1]]
12389 int32x2_t test_vqneg_s32(int32x2_t a) {
12390   return vqneg_s32(a);
12391 }
12392 
12393 // CHECK-LABEL: define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
12394 // CHECK:   [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a) #4
12395 // CHECK:   ret <16 x i8> [[VQNEGQ_V_I]]
12396 int8x16_t test_vqnegq_s8(int8x16_t a) {
12397   return vqnegq_s8(a);
12398 }
12399 
12400 // CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
12401 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12402 // CHECK:   [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12403 // CHECK:   [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> [[VQNEGQ_V_I]]) #4
12404 // CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
12405 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <8 x i16>
12406 // CHECK:   ret <8 x i16> [[TMP1]]
12407 int16x8_t test_vqnegq_s16(int16x8_t a) {
12408   return vqnegq_s16(a);
12409 }
12410 
12411 // CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
12412 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12413 // CHECK:   [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12414 // CHECK:   [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> [[VQNEGQ_V_I]]) #4
12415 // CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
12416 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <4 x i32>
12417 // CHECK:   ret <4 x i32> [[TMP1]]
12418 int32x4_t test_vqnegq_s32(int32x4_t a) {
12419   return vqnegq_s32(a);
12420 }
12421 
12422 
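// vqrdmulh(q)_*: saturating rounding doubling multiply high, the rounding counterpart of vqdmulh.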
12423 // CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
12424 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12425 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
12426 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12427 // CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12428 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
12429 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
12430 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
12431 // CHECK:   ret <4 x i16> [[TMP2]]
12432 int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
12433   return vqrdmulh_s16(a, b);
12434 }
12435 
12436 // CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
12437 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12438 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
12439 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12440 // CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12441 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
12442 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
12443 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
12444 // CHECK:   ret <2 x i32> [[TMP2]]
12445 int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
12446   return vqrdmulh_s32(a, b);
12447 }
12448 
12449 // CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
12450 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12451 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
12452 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12453 // CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12454 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4
12455 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
12456 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
12457 // CHECK:   ret <8 x i16> [[TMP2]]
12458 int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
12459   return vqrdmulhq_s16(a, b);
12460 }
12461 
12462 // CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
12463 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12464 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
12465 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12466 // CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12467 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4
12468 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
12469 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
12470 // CHECK:   ret <4 x i32> [[TMP2]]
12471 int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
12472   return vqrdmulhq_s32(a, b);
12473 }
12474 
12475 
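// vqrdmulh(q)_lane_*: vqrdmulh with the second operand broadcast from one lane.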
12476 // CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
12477 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
12478 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12479 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
12480 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12481 // CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12482 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
12483 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
12484 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
12485 // CHECK:   ret <4 x i16> [[TMP2]]
12486 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) {
12487   return vqrdmulh_lane_s16(a, b, 3);
12488 }
12489 
12490 // CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
12491 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
12492 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12493 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
12494 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12495 // CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12496 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
12497 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
12498 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
12499 // CHECK:   ret <2 x i32> [[TMP2]]
12500 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) {
12501   return vqrdmulh_lane_s32(a, b, 1);
12502 }
12503 
12504 // CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
12505 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
12506 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12507 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
12508 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12509 // CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12510 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4
12511 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
12512 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
12513 // CHECK:   ret <8 x i16> [[TMP2]]
12514 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
12515   return vqrdmulhq_lane_s16(a, b, 3);
12516 }
12517 
12518 // CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
12519 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
12520 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12521 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
12522 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12523 // CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12524 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4
12525 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
12526 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
12527 // CHECK:   ret <4 x i32> [[TMP2]]
12528 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
12529   return vqrdmulhq_lane_s32(a, b, 1);
12530 }
12531 
12532 
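// vqrdmulh(q)_n_*: vqrdmulh with the second operand splatted from a scalar.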
12533 // CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 signext %b) #0 {
12534 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12535 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
12536 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
12537 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
12538 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
12539 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
12540 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12541 // CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12542 // CHECK:   [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #4
12543 // CHECK:   [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
12544 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16>
12545 // CHECK:   ret <4 x i16> [[TMP2]]
12546 int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
12547   return vqrdmulh_n_s16(a, b);
12548 }
12549 
12550 // CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
12551 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12552 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
12553 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
12554 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
12555 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12556 // CHECK:   [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12557 // CHECK:   [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #4
12558 // CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
12559 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32>
12560 // CHECK:   ret <2 x i32> [[TMP2]]
12561 int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
12562   return vqrdmulh_n_s32(a, b);
12563 }
12564 
12565 // CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 signext %b) #0 {
12566 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12567 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
12568 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
12569 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
12570 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
12571 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
12572 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
12573 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
12574 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
12575 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
12576 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12577 // CHECK:   [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12578 // CHECK:   [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #4
12579 // CHECK:   [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
12580 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16>
12581 // CHECK:   ret <8 x i16> [[TMP2]]
12582 int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
12583   return vqrdmulhq_n_s16(a, b);
12584 }
12585 
12586 // CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
12587 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12588 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
12589 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
12590 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
12591 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
12592 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
12593 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12594 // CHECK:   [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12595 // CHECK:   [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #4
12596 // CHECK:   [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
12597 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V6_I]] to <4 x i32>
12598 // CHECK:   ret <4 x i32> [[TMP2]]
12599 int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
12600   return vqrdmulhq_n_s32(a, b);
12601 }
12602 
12603 
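// vqrshl(q)_*: saturating rounding shift left by signed per-lane counts; negative counts shift right.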
12604 // CHECK-LABEL: define <8 x i8> @test_vqrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
12605 // CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
12606 // CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
12607 int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
12608   return vqrshl_s8(a, b);
12609 }
12610 
12611 // CHECK-LABEL: define <4 x i16> @test_vqrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
12612 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12613 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
12614 // CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12615 // CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12616 // CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
12617 // CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
12618 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
12619 // CHECK:   ret <4 x i16> [[TMP2]]
12620 int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
12621   return vqrshl_s16(a, b);
12622 }
12623 
12624 // CHECK-LABEL: define <2 x i32> @test_vqrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
12625 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12626 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
12627 // CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12628 // CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12629 // CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
12630 // CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
12631 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
12632 // CHECK:   ret <2 x i32> [[TMP2]]
12633 int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
12634   return vqrshl_s32(a, b);
12635 }
12636 
12637 // CHECK-LABEL: define <1 x i64> @test_vqrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
12638 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
12639 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
12640 // CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
12641 // CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
12642 // CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
12643 // CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
12644 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
12645 // CHECK:   ret <1 x i64> [[TMP2]]
12646 int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
12647   return vqrshl_s64(a, b);
12648 }
12649 
12650 // CHECK-LABEL: define <8 x i8> @test_vqrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
12651 // CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
12652 // CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
12653 uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
12654   return vqrshl_u8(a, b);
12655 }
12656 
12657 // CHECK-LABEL: define <4 x i16> @test_vqrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
12658 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12659 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
12660 // CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12661 // CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12662 // CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
12663 // CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
12664 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
12665 // CHECK:   ret <4 x i16> [[TMP2]]
12666 uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
12667   return vqrshl_u16(a, b);
12668 }
12669 
12670 // CHECK-LABEL: define <2 x i32> @test_vqrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
12671 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12672 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
12673 // CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12674 // CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12675 // CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
12676 // CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
12677 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
12678 // CHECK:   ret <2 x i32> [[TMP2]]
12679 uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
12680   return vqrshl_u32(a, b);
12681 }
12682 
12683 // CHECK-LABEL: define <1 x i64> @test_vqrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
12684 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
12685 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
12686 // CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
12687 // CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
12688 // CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
12689 // CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
12690 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
12691 // CHECK:   ret <1 x i64> [[TMP2]]
12692 uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) {
12693   return vqrshl_u64(a, b);
12694 }
12695 
12696 // CHECK-LABEL: define <16 x i8> @test_vqrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
12697 // CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
12698 // CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
12699 int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) {
12700   return vqrshlq_s8(a, b);
12701 }
12702 
12703 // CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
12704 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12705 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
12706 // CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12707 // CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12708 // CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4
12709 // CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
12710 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
12711 // CHECK:   ret <8 x i16> [[TMP2]]
12712 int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
12713   return vqrshlq_s16(a, b);
12714 }
12715 
12716 // CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
12717 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12718 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
12719 // CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12720 // CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12721 // CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4
12722 // CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
12723 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
12724 // CHECK:   ret <4 x i32> [[TMP2]]
12725 int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
12726   return vqrshlq_s32(a, b);
12727 }
12728 
12729 // CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
12730 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12731 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
12732 // CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
12733 // CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
12734 // CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4
12735 // CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
12736 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
12737 // CHECK:   ret <2 x i64> [[TMP2]]
12738 int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) {
12739   return vqrshlq_s64(a, b);
12740 }
12741 
12742 // CHECK-LABEL: define <16 x i8> @test_vqrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
12743 // CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
12744 // CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
12745 uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) {
12746   return vqrshlq_u8(a, b);
12747 }
12748 
12749 // CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
12750 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12751 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
12752 // CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12753 // CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12754 // CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4
12755 // CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
12756 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
12757 // CHECK:   ret <8 x i16> [[TMP2]]
12758 uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
12759   return vqrshlq_u16(a, b);
12760 }
12761 
12762 // CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
12763 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12764 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
12765 // CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12766 // CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12767 // CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4
12768 // CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
12769 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
12770 // CHECK:   ret <4 x i32> [[TMP2]]
12771 uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
12772   return vqrshlq_u32(a, b);
12773 }
12774 
12775 // CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
12776 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12777 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
12778 // CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
12779 // CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
12780 // CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4
12781 // CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
12782 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
12783 // CHECK:   ret <2 x i64> [[TMP2]]
12784 uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
12785   return vqrshlq_u64(a, b);
12786 }
12787 
12788 
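// vqrshrn_n_*: saturating rounding shift right and narrow by an immediate, lowered as vqrshiftns/vqrshiftnu with a negated constant vector.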
12789 // CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) #0 {
12790 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12791 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12792 // CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
12793 // CHECK:   ret <8 x i8> [[VQRSHRN_N1]]
12794 int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
12795   return vqrshrn_n_s16(a, 1);
12796 }
12797 
12798 // CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) #0 {
12799 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12800 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12801 // CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
12802 // CHECK:   ret <4 x i16> [[VQRSHRN_N1]]
12803 int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
12804   return vqrshrn_n_s32(a, 1);
12805 }
12806 
12807 // CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) #0 {
12808 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12809 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
12810 // CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
12811 // CHECK:   ret <2 x i32> [[VQRSHRN_N1]]
12812 int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
12813   return vqrshrn_n_s64(a, 1);
12814 }
12815 
12816 // CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) #0 {
12817 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12818 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12819 // CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
12820 // CHECK:   ret <8 x i8> [[VQRSHRN_N1]]
12821 uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
12822   return vqrshrn_n_u16(a, 1);
12823 }
12824 
12825 // CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) #0 {
12826 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12827 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12828 // CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
12829 // CHECK:   ret <4 x i16> [[VQRSHRN_N1]]
12830 uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
12831   return vqrshrn_n_u32(a, 1);
12832 }
12833 
12834 // CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) #0 {
12835 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12836 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
12837 // CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
12838 // CHECK:   ret <2 x i32> [[VQRSHRN_N1]]
12839 uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
12840   return vqrshrn_n_u64(a, 1);
12841 }
12842 
12843 
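// vqrshrun_n_*: like vqrshrn_n, but narrows signed input to an unsigned result (vqrshiftnsu).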
12844 // CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) #0 {
12845 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12846 // CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12847 // CHECK:   [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> [[VQRSHRUN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
12848 // CHECK:   ret <8 x i8> [[VQRSHRUN_N1]]
12849 uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
12850   return vqrshrun_n_s16(a, 1);
12851 }
12852 
12853 // CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) #0 {
12854 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12855 // CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12856 // CHECK:   [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> [[VQRSHRUN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
12857 // CHECK:   ret <4 x i16> [[VQRSHRUN_N1]]
12858 uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
12859   return vqrshrun_n_s32(a, 1);
12860 }
12861 
12862 // CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) #0 {
12863 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12864 // CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
12865 // CHECK:   [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> [[VQRSHRUN_N]], <2 x i64> <i64 -1, i64 -1>)
12866 // CHECK:   ret <2 x i32> [[VQRSHRUN_N1]]
12867 uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
12868   return vqrshrun_n_s64(a, 1);
12869 }
12870 
12871 
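// vqshl(q)_*: saturating shift left by signed per-lane counts, the non-rounding counterpart of vqrshl.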
12872 // CHECK-LABEL: define <8 x i8> @test_vqshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
12873 // CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
12874 // CHECK:   ret <8 x i8> [[VQSHL_V_I]]
12875 int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
12876   return vqshl_s8(a, b);
12877 }
12878 
12879 // CHECK-LABEL: define <4 x i16> @test_vqshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
12880 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12881 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
12882 // CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12883 // CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12884 // CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
12885 // CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
12886 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
12887 // CHECK:   ret <4 x i16> [[TMP2]]
12888 int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
12889   return vqshl_s16(a, b);
12890 }
12891 
12892 // CHECK-LABEL: define <2 x i32> @test_vqshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
12893 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12894 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
12895 // CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12896 // CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12897 // CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
12898 // CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
12899 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
12900 // CHECK:   ret <2 x i32> [[TMP2]]
12901 int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
12902   return vqshl_s32(a, b);
12903 }
12904 
12905 // CHECK-LABEL: define <1 x i64> @test_vqshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
12906 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
12907 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
12908 // CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
12909 // CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
12910 // CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
12911 // CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
12912 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
12913 // CHECK:   ret <1 x i64> [[TMP2]]
12914 int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
12915   return vqshl_s64(a, b);
12916 }
12917 
12918 // CHECK-LABEL: define <8 x i8> @test_vqshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
12919 // CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
12920 // CHECK:   ret <8 x i8> [[VQSHL_V_I]]
12921 uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
12922   return vqshl_u8(a, b);
12923 }
12924 
12925 // CHECK-LABEL: define <4 x i16> @test_vqshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
12926 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12927 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
12928 // CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12929 // CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12930 // CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
12931 // CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
12932 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
12933 // CHECK:   ret <4 x i16> [[TMP2]]
12934 uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
12935   return vqshl_u16(a, b);
12936 }
12937 
12938 // CHECK-LABEL: define <2 x i32> @test_vqshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
12939 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12940 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
12941 // CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12942 // CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12943 // CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
12944 // CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
12945 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
12946 // CHECK:   ret <2 x i32> [[TMP2]]
12947 uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
12948   return vqshl_u32(a, b);
12949 }
12950 
12951 // CHECK-LABEL: define <1 x i64> @test_vqshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
12952 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
12953 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
12954 // CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
12955 // CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
12956 // CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
12957 // CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
12958 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
12959 // CHECK:   ret <1 x i64> [[TMP2]]
12960 uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
12961   return vqshl_u64(a, b);
12962 }
12963 
12964 // CHECK-LABEL: define <16 x i8> @test_vqshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
12965 // CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
12966 // CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
12967 int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
12968   return vqshlq_s8(a, b);
12969 }
12970 
12971 // CHECK-LABEL: define <8 x i16> @test_vqshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
12972 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12973 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
12974 // CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12975 // CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12976 // CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
12977 // CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
12978 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
12979 // CHECK:   ret <8 x i16> [[TMP2]]
12980 int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
12981   return vqshlq_s16(a, b);
12982 }
12983 
12984 // CHECK-LABEL: define <4 x i32> @test_vqshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
12985 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12986 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
12987 // CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12988 // CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12989 // CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
12990 // CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
12991 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
12992 // CHECK:   ret <4 x i32> [[TMP2]]
12993 int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
12994   return vqshlq_s32(a, b);
12995 }
12996 
12997 // CHECK-LABEL: define <2 x i64> @test_vqshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
12998 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12999 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
13000 // CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13001 // CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
13002 // CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
13003 // CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
13004 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
13005 // CHECK:   ret <2 x i64> [[TMP2]]
13006 int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
13007   return vqshlq_s64(a, b);
13008 }
13009 
13010 // CHECK-LABEL: define <16 x i8> @test_vqshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
13011 // CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
13012 // CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
13013 uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
13014   return vqshlq_u8(a, b);
13015 }
13016 
13017 // CHECK-LABEL: define <8 x i16> @test_vqshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
13018 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13019 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
13020 // CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13021 // CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
13022 // CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
13023 // CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
13024 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
13025 // CHECK:   ret <8 x i16> [[TMP2]]
13026 uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
13027   return vqshlq_u16(a, b);
13028 }
13029 
13030 // CHECK-LABEL: define <4 x i32> @test_vqshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
13031 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13032 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
13033 // CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13034 // CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
13035 // CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
13036 // CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
13037 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
13038 // CHECK:   ret <4 x i32> [[TMP2]]
13039 uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
13040   return vqshlq_u32(a, b);
13041 }
13042 
13043 // CHECK-LABEL: define <2 x i64> @test_vqshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
13044 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13045 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
13046 // CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13047 // CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
13048 // CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
13049 // CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
13050 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
13051 // CHECK:   ret <2 x i64> [[TMP2]]
13052 uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
13053   return vqshlq_u64(a, b);
13054 }
13055 
13056 
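// vqshlu_n/vqshluq_n: saturating shift left unsigned. Each signed input
// element is shifted left by the immediate and saturated to the unsigned
// range of its element type, so the results are unsigned vectors.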
13057 // CHECK-LABEL: define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) #0 {
13058 // CHECK:   [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
13059 // CHECK:   ret <8 x i8> [[VQSHLU_N]]
13060 uint8x8_t test_vqshlu_n_s8(int8x8_t a) {
13061   return vqshlu_n_s8(a, 1);
13062 }
13063 
13064 // CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) #0 {
13065 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
13066 // CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
13067 // CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
13068 // CHECK:   ret <4 x i16> [[VQSHLU_N1]]
13069 uint16x4_t test_vqshlu_n_s16(int16x4_t a) {
13070   return vqshlu_n_s16(a, 1);
13071 }
13072 
13073 // CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) #0 {
13074 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13075 // CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
13076 // CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 1, i32 1>)
13077 // CHECK:   ret <2 x i32> [[VQSHLU_N1]]
13078 uint32x2_t test_vqshlu_n_s32(int32x2_t a) {
13079   return vqshlu_n_s32(a, 1);
13080 }
13081 
13082 // CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64(<1 x i64> %a) #0 {
13083 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
13084 // CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
13085 // CHECK:   [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>)
13086 // CHECK:   ret <1 x i64> [[VQSHLU_N1]]
13087 uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
13088   return vqshlu_n_s64(a, 1);
13089 }
13090 
13091 // CHECK-LABEL: define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) #0 {
13092 // CHECK:   [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
13093 // CHECK:   ret <16 x i8> [[VQSHLU_N]]
13094 uint8x16_t test_vqshluq_n_s8(int8x16_t a) {
13095   return vqshluq_n_s8(a, 1);
13096 }
13097 
13098 // CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) #0 {
13099 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13100 // CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13101 // CHECK:   [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
13102 // CHECK:   ret <8 x i16> [[VQSHLU_N1]]
13103 uint16x8_t test_vqshluq_n_s16(int16x8_t a) {
13104   return vqshluq_n_s16(a, 1);
13105 }
13106 
13107 // CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) #0 {
13108 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13109 // CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13110 // CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
13111 // CHECK:   ret <4 x i32> [[VQSHLU_N1]]
13112 uint32x4_t test_vqshluq_n_s32(int32x4_t a) {
13113   return vqshluq_n_s32(a, 1);
13114 }
13115 
13116 // CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) #0 {
13117 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13118 // CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13119 // CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 1, i64 1>)
13120 // CHECK:   ret <2 x i64> [[VQSHLU_N1]]
13121 uint64x2_t test_vqshluq_n_s64(int64x2_t a) {
13122   return vqshluq_n_s64(a, 1);
13123 }
13124 
13125 
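// vqshl_n/vqshlq_n: saturating shift left by an immediate. The constant shift
// amount is lowered as a splat vector operand to the same vqshifts/vqshiftu
// intrinsics used by the register-shift forms above.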
13126 // CHECK-LABEL: define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) #0 {
13127 // CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
13128 // CHECK:   ret <8 x i8> [[VQSHL_N]]
13129 int8x8_t test_vqshl_n_s8(int8x8_t a) {
13130   return vqshl_n_s8(a, 1);
13131 }
13132 
13133 // CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) #0 {
13134 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
13135 // CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
13136 // CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
13137 // CHECK:   ret <4 x i16> [[VQSHL_N1]]
13138 int16x4_t test_vqshl_n_s16(int16x4_t a) {
13139   return vqshl_n_s16(a, 1);
13140 }
13141 
13142 // CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) #0 {
13143 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13144 // CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
13145 // CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
13146 // CHECK:   ret <2 x i32> [[VQSHL_N1]]
13147 int32x2_t test_vqshl_n_s32(int32x2_t a) {
13148   return vqshl_n_s32(a, 1);
13149 }
13150 
13151 // CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64(<1 x i64> %a) #0 {
13152 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
13153 // CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
13154 // CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
13155 // CHECK:   ret <1 x i64> [[VQSHL_N1]]
13156 int64x1_t test_vqshl_n_s64(int64x1_t a) {
13157   return vqshl_n_s64(a, 1);
13158 }
13159 
13160 // CHECK-LABEL: define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) #0 {
13161 // CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
13162 // CHECK:   ret <8 x i8> [[VQSHL_N]]
13163 uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
13164   return vqshl_n_u8(a, 1);
13165 }
13166 
13167 // CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) #0 {
13168 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
13169 // CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
13170 // CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
13171 // CHECK:   ret <4 x i16> [[VQSHL_N1]]
13172 uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
13173   return vqshl_n_u16(a, 1);
13174 }
13175 
13176 // CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) #0 {
13177 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13178 // CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
13179 // CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
13180 // CHECK:   ret <2 x i32> [[VQSHL_N1]]
13181 uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
13182   return vqshl_n_u32(a, 1);
13183 }
13184 
13185 // CHECK-LABEL: define <1 x i64> @test_vqshl_n_u64(<1 x i64> %a) #0 {
13186 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
13187 // CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
13188 // CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
13189 // CHECK:   ret <1 x i64> [[VQSHL_N1]]
13190 uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
13191   return vqshl_n_u64(a, 1);
13192 }
13193 
13194 // CHECK-LABEL: define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) #0 {
13195 // CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
13196 // CHECK:   ret <16 x i8> [[VQSHL_N]]
13197 int8x16_t test_vqshlq_n_s8(int8x16_t a) {
13198   return vqshlq_n_s8(a, 1);
13199 }
13200 
13201 // CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) #0 {
13202 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13203 // CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13204 // CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
13205 // CHECK:   ret <8 x i16> [[VQSHL_N1]]
13206 int16x8_t test_vqshlq_n_s16(int16x8_t a) {
13207   return vqshlq_n_s16(a, 1);
13208 }
13209 
13210 // CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) #0 {
13211 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13212 // CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13213 // CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
13214 // CHECK:   ret <4 x i32> [[VQSHL_N1]]
13215 int32x4_t test_vqshlq_n_s32(int32x4_t a) {
13216   return vqshlq_n_s32(a, 1);
13217 }
13218 
13219 // CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) #0 {
13220 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13221 // CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13222 // CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
13223 // CHECK:   ret <2 x i64> [[VQSHL_N1]]
13224 int64x2_t test_vqshlq_n_s64(int64x2_t a) {
13225   return vqshlq_n_s64(a, 1);
13226 }
13227 
13228 // CHECK-LABEL: define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) #0 {
13229 // CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
13230 // CHECK:   ret <16 x i8> [[VQSHL_N]]
13231 uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
13232   return vqshlq_n_u8(a, 1);
13233 }
13234 
13235 // CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) #0 {
13236 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13237 // CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13238 // CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
13239 // CHECK:   ret <8 x i16> [[VQSHL_N1]]
13240 uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
13241   return vqshlq_n_u16(a, 1);
13242 }
13243 
13244 // CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) #0 {
13245 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13246 // CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13247 // CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
13248 // CHECK:   ret <4 x i32> [[VQSHL_N1]]
13249 uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
13250   return vqshlq_n_u32(a, 1);
13251 }
13252 
13253 // CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) #0 {
13254 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13255 // CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13256 // CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
13257 // CHECK:   ret <2 x i64> [[VQSHL_N1]]
13258 uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
13259   return vqshlq_n_u64(a, 1);
13260 }
13261 
13262 
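// vqshrn_n: saturating shift right and narrow to half-width elements. Note
// that the IR expresses the right shift as a left shift by a negative splat
// constant (e.g. <i16 -1, ...> for a shift of 1).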
13263 // CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) #0 {
13264 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13265 // CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13266 // CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
13267 // CHECK:   ret <8 x i8> [[VQSHRN_N1]]
13268 int8x8_t test_vqshrn_n_s16(int16x8_t a) {
13269   return vqshrn_n_s16(a, 1);
13270 }
13271 
13272 // CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) #0 {
13273 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13274 // CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13275 // CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
13276 // CHECK:   ret <4 x i16> [[VQSHRN_N1]]
13277 int16x4_t test_vqshrn_n_s32(int32x4_t a) {
13278   return vqshrn_n_s32(a, 1);
13279 }
13280 
13281 // CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) #0 {
13282 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13283 // CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13284 // CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
13285 // CHECK:   ret <2 x i32> [[VQSHRN_N1]]
13286 int32x2_t test_vqshrn_n_s64(int64x2_t a) {
13287   return vqshrn_n_s64(a, 1);
13288 }
13289 
13290 // CHECK-LABEL: define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) #0 {
13291 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13292 // CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13293 // CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
13294 // CHECK:   ret <8 x i8> [[VQSHRN_N1]]
13295 uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
13296   return vqshrn_n_u16(a, 1);
13297 }
13298 
13299 // CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) #0 {
13300 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13301 // CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13302 // CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
13303 // CHECK:   ret <4 x i16> [[VQSHRN_N1]]
13304 uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
13305   return vqshrn_n_u32(a, 1);
13306 }
13307 
13308 // CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) #0 {
13309 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13310 // CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13311 // CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
13312 // CHECK:   ret <2 x i32> [[VQSHRN_N1]]
13313 uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
13314   return vqshrn_n_u64(a, 1);
13315 }
13316 
13317 
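// vqshrun_n: signed inputs are shifted right, saturated to the unsigned
// range, and narrowed to half-width elements (vqshiftnsu).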
13318 // CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) #0 {
13319 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13320 // CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13321 // CHECK:   [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> [[VQSHRUN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
13322 // CHECK:   ret <8 x i8> [[VQSHRUN_N1]]
13323 uint8x8_t test_vqshrun_n_s16(int16x8_t a) {
13324   return vqshrun_n_s16(a, 1);
13325 }
13326 
13327 // CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) #0 {
13328 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13329 // CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13330 // CHECK:   [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> [[VQSHRUN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
13331 // CHECK:   ret <4 x i16> [[VQSHRUN_N1]]
13332 uint16x4_t test_vqshrun_n_s32(int32x4_t a) {
13333   return vqshrun_n_s32(a, 1);
13334 }
13335 
13336 // CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) #0 {
13337 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13338 // CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13339 // CHECK:   [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> [[VQSHRUN_N]], <2 x i64> <i64 -1, i64 -1>)
13340 // CHECK:   ret <2 x i32> [[VQSHRUN_N1]]
13341 uint32x2_t test_vqshrun_n_s64(int64x2_t a) {
13342   return vqshrun_n_s64(a, 1);
13343 }
13344 
13345 
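// vqsub/vqsubq: saturating subtract; results clamp at the limits of the
// element type instead of wrapping.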
13346 // CHECK-LABEL: define <8 x i8> @test_vqsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
13347 // CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
13348 // CHECK:   ret <8 x i8> [[VQSUB_V_I]]
13349 int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
13350   return vqsub_s8(a, b);
13351 }
13352 
13353 // CHECK-LABEL: define <4 x i16> @test_vqsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
13354 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
13355 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
13356 // CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
13357 // CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
13358 // CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
13359 // CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
13360 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
13361 // CHECK:   ret <4 x i16> [[TMP2]]
13362 int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
13363   return vqsub_s16(a, b);
13364 }
13365 
13366 // CHECK-LABEL: define <2 x i32> @test_vqsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
13367 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13368 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
13369 // CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
13370 // CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
13371 // CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
13372 // CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
13373 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
13374 // CHECK:   ret <2 x i32> [[TMP2]]
13375 int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
13376   return vqsub_s32(a, b);
13377 }
13378 
13379 // CHECK-LABEL: define <1 x i64> @test_vqsub_s64(<1 x i64> %a, <1 x i64> %b) #0 {
13380 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
13381 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
13382 // CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
13383 // CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
13384 // CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
13385 // CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
13386 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
13387 // CHECK:   ret <1 x i64> [[TMP2]]
13388 int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
13389   return vqsub_s64(a, b);
13390 }
13391 
13392 // CHECK-LABEL: define <8 x i8> @test_vqsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
13393 // CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
13394 // CHECK:   ret <8 x i8> [[VQSUB_V_I]]
13395 uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
13396   return vqsub_u8(a, b);
13397 }
13398 
13399 // CHECK-LABEL: define <4 x i16> @test_vqsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
13400 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
13401 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
13402 // CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
13403 // CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
13404 // CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
13405 // CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
13406 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
13407 // CHECK:   ret <4 x i16> [[TMP2]]
13408 uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
13409   return vqsub_u16(a, b);
13410 }
13411 
13412 // CHECK-LABEL: define <2 x i32> @test_vqsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
13413 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13414 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
13415 // CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
13416 // CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
13417 // CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
13418 // CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
13419 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
13420 // CHECK:   ret <2 x i32> [[TMP2]]
13421 uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
13422   return vqsub_u32(a, b);
13423 }
13424 
13425 // CHECK-LABEL: define <1 x i64> @test_vqsub_u64(<1 x i64> %a, <1 x i64> %b) #0 {
13426 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
13427 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
13428 // CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
13429 // CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
13430 // CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
13431 // CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
13432 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
13433 // CHECK:   ret <1 x i64> [[TMP2]]
13434 uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
13435   return vqsub_u64(a, b);
13436 }
13437 
13438 // CHECK-LABEL: define <16 x i8> @test_vqsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
13439 // CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %a, <16 x i8> %b) #4
13440 // CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
13441 int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
13442   return vqsubq_s8(a, b);
13443 }
13444 
13445 // CHECK-LABEL: define <8 x i16> @test_vqsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
13446 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13447 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
13448 // CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13449 // CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
13450 // CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
13451 // CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
13452 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
13453 // CHECK:   ret <8 x i16> [[TMP2]]
13454 int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
13455   return vqsubq_s16(a, b);
13456 }
13457 
13458 // CHECK-LABEL: define <4 x i32> @test_vqsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
13459 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13460 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
13461 // CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13462 // CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
13463 // CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
13464 // CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
13465 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
13466 // CHECK:   ret <4 x i32> [[TMP2]]
13467 int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
13468   return vqsubq_s32(a, b);
13469 }
13470 
13471 // CHECK-LABEL: define <2 x i64> @test_vqsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
13472 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13473 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
13474 // CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13475 // CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
13476 // CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
13477 // CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
13478 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
13479 // CHECK:   ret <2 x i64> [[TMP2]]
13480 int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
13481   return vqsubq_s64(a, b);
13482 }
13483 
13484 // CHECK-LABEL: define <16 x i8> @test_vqsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
13485 // CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
13486 // CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
13487 uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
13488   return vqsubq_u8(a, b);
13489 }
13490 
13491 // CHECK-LABEL: define <8 x i16> @test_vqsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
13492 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13493 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
13494 // CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13495 // CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
13496 // CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
13497 // CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
13498 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
13499 // CHECK:   ret <8 x i16> [[TMP2]]
13500 uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
13501   return vqsubq_u16(a, b);
13502 }
13503 
13504 // CHECK-LABEL: define <4 x i32> @test_vqsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
13505 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13506 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
13507 // CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13508 // CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
13509 // CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
13510 // CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
13511 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
13512 // CHECK:   ret <4 x i32> [[TMP2]]
13513 uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
13514   return vqsubq_u32(a, b);
13515 }
13516 
13517 // CHECK-LABEL: define <2 x i64> @test_vqsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
13518 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13519 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
13520 // CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13521 // CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
13522 // CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
13523 // CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
13524 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
13525 // CHECK:   ret <2 x i64> [[TMP2]]
13526 uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
13527   return vqsubq_u64(a, b);
13528 }
13529 
13530 
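// vraddhn: rounding add and narrow, keeping the high half of each widened sum.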
13531 // CHECK-LABEL: define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
13532 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13533 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
13534 // CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13535 // CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
13536 // CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4
13537 // CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
13538 int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
13539   return vraddhn_s16(a, b);
13540 }
13541 
13542 // CHECK-LABEL: define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
13543 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13544 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
13545 // CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13546 // CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
13547 // CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4
13548 // CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
13549 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
13550 // CHECK:   ret <4 x i16> [[TMP2]]
13551 int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
13552   return vraddhn_s32(a, b);
13553 }
13554 
13555 // CHECK-LABEL: define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
13556 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13557 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
13558 // CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13559 // CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
13560 // CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4
13561 // CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
13562 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
13563 // CHECK:   ret <2 x i32> [[TMP2]]
13564 int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
13565   return vraddhn_s64(a, b);
13566 }
13567 
13568 // CHECK-LABEL: define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
13569 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13570 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
13571 // CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13572 // CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
13573 // CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4
13574 // CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
13575 uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
13576   return vraddhn_u16(a, b);
13577 }
13578 
13579 // CHECK-LABEL: define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
13580 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13581 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
13582 // CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13583 // CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
13584 // CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4
13585 // CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
13586 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
13587 // CHECK:   ret <4 x i16> [[TMP2]]
13588 uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
13589   return vraddhn_u32(a, b);
13590 }
13591 
13592 // CHECK-LABEL: define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
13593 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13594 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
13595 // CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13596 // CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
13597 // CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4
13598 // CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
13599 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
13600 // CHECK:   ret <2 x i32> [[TMP2]]
13601 uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
13602   return vraddhn_u64(a, b);
13603 }
13604 
13605 
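// vrecpe/vrecpeq: reciprocal estimate, available for both float and u32
// element types.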
13606 // CHECK-LABEL: define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 {
13607 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
13608 // CHECK:   [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
13609 // CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> [[VRECPE_V_I]]) #4
13610 // CHECK:   ret <2 x float> [[VRECPE_V1_I]]
13611 float32x2_t test_vrecpe_f32(float32x2_t a) {
13612   return vrecpe_f32(a);
13613 }
13614 
13615 // CHECK-LABEL: define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 {
13616 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13617 // CHECK:   [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
13618 // CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> [[VRECPE_V_I]]) #4
13619 // CHECK:   ret <2 x i32> [[VRECPE_V1_I]]
13620 uint32x2_t test_vrecpe_u32(uint32x2_t a) {
13621   return vrecpe_u32(a);
13622 }
13623 
13624 // CHECK-LABEL: define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 {
13625 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
13626 // CHECK:   [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
13627 // CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> [[VRECPEQ_V_I]]) #4
13628 // CHECK:   ret <4 x float> [[VRECPEQ_V1_I]]
13629 float32x4_t test_vrecpeq_f32(float32x4_t a) {
13630   return vrecpeq_f32(a);
13631 }
13632 
13633 // CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 {
13634 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13635 // CHECK:   [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13636 // CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> [[VRECPEQ_V_I]]) #4
13637 // CHECK:   ret <4 x i32> [[VRECPEQ_V1_I]]
13638 uint32x4_t test_vrecpeq_u32(uint32x4_t a) {
13639   return vrecpeq_u32(a);
13640 }
13641 
13642 
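// vrecps/vrecpsq: Newton-Raphson reciprocal step (computes 2 - a*b), combined
// with vrecpe to refine a reciprocal estimate.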
13643 // CHECK-LABEL: define <2 x float> @test_vrecps_f32(<2 x float> %a, <2 x float> %b) #0 {
13644 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
13645 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
13646 // CHECK:   [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
13647 // CHECK:   [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
13648 // CHECK:   [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> [[VRECPS_V_I]], <2 x float> [[VRECPS_V1_I]]) #4
13649 // CHECK:   [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8>
13650 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <2 x float>
13651 // CHECK:   ret <2 x float> [[TMP2]]
13652 float32x2_t test_vrecps_f32(float32x2_t a, float32x2_t b) {
13653   return vrecps_f32(a, b);
13654 }
13655 
13656 // CHECK-LABEL: define <4 x float> @test_vrecpsq_f32(<4 x float> %a, <4 x float> %b) #0 {
13657 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
13658 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
13659 // CHECK:   [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
13660 // CHECK:   [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
13661 // CHECK:   [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> [[VRECPSQ_V_I]], <4 x float> [[VRECPSQ_V1_I]]) #4
13662 // CHECK:   [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
13663 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <4 x float>
13664 // CHECK:   ret <4 x float> [[TMP2]]
13665 float32x4_t test_vrecpsq_f32(float32x4_t a, float32x4_t b) {
13666   return vrecpsq_f32(a, b);
13667 }
13668 
13669 
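// vreinterpret_*: reinterpret the bits of one 64-bit vector type as another.
// These lower to a plain bitcast, or to no IR at all when the source and
// destination already share the same IR type.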
13670 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s16(<4 x i16> %a) #0 {
13671 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
13672 // CHECK:   ret <8 x i8> [[TMP0]]
13673 int8x8_t test_vreinterpret_s8_s16(int16x4_t a) {
13674   return vreinterpret_s8_s16(a);
13675 }
13676 
13677 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s32(<2 x i32> %a) #0 {
13678 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13679 // CHECK:   ret <8 x i8> [[TMP0]]
13680 int8x8_t test_vreinterpret_s8_s32(int32x2_t a) {
13681   return vreinterpret_s8_s32(a);
13682 }
13683 
13684 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s64(<1 x i64> %a) #0 {
13685 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
13686 // CHECK:   ret <8 x i8> [[TMP0]]
13687 int8x8_t test_vreinterpret_s8_s64(int64x1_t a) {
13688   return vreinterpret_s8_s64(a);
13689 }
13690 
13691 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u8(<8 x i8> %a) #0 {
13692 // CHECK:   ret <8 x i8> %a
13693 int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) {
13694   return vreinterpret_s8_u8(a);
13695 }
13696 
13697 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u16(<4 x i16> %a) #0 {
13698 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
13699 // CHECK:   ret <8 x i8> [[TMP0]]
13700 int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) {
13701   return vreinterpret_s8_u16(a);
13702 }
13703 
13704 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u32(<2 x i32> %a) #0 {
13705 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13706 // CHECK:   ret <8 x i8> [[TMP0]]
13707 int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) {
13708   return vreinterpret_s8_u32(a);
13709 }
13710 
13711 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u64(<1 x i64> %a) #0 {
13712 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
13713 // CHECK:   ret <8 x i8> [[TMP0]]
13714 int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) {
13715   return vreinterpret_s8_u64(a);
13716 }
13717 
13718 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f16(<4 x half> %a) #0 {
13719 // CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
13720 // CHECK:   ret <8 x i8> [[TMP0]]
13721 int8x8_t test_vreinterpret_s8_f16(float16x4_t a) {
13722   return vreinterpret_s8_f16(a);
13723 }
13724 
13725 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f32(<2 x float> %a) #0 {
13726 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
13727 // CHECK:   ret <8 x i8> [[TMP0]]
13728 int8x8_t test_vreinterpret_s8_f32(float32x2_t a) {
13729   return vreinterpret_s8_f32(a);
13730 }
13731 
13732 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p8(<8 x i8> %a) #0 {
13733 // CHECK:   ret <8 x i8> %a
13734 int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) {
13735   return vreinterpret_s8_p8(a);
13736 }
13737 
13738 // CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p16(<4 x i16> %a) #0 {
13739 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
13740 // CHECK:   ret <8 x i8> [[TMP0]]
13741 int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) {
13742   return vreinterpret_s8_p16(a);
13743 }
13744 
13745 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s8(<8 x i8> %a) #0 {
13746 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
13747 // CHECK:   ret <4 x i16> [[TMP0]]
13748 int16x4_t test_vreinterpret_s16_s8(int8x8_t a) {
13749   return vreinterpret_s16_s8(a);
13750 }
13751 
13752 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s32(<2 x i32> %a) #0 {
13753 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
13754 // CHECK:   ret <4 x i16> [[TMP0]]
13755 int16x4_t test_vreinterpret_s16_s32(int32x2_t a) {
13756   return vreinterpret_s16_s32(a);
13757 }
13758 
13759 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s64(<1 x i64> %a) #0 {
13760 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
13761 // CHECK:   ret <4 x i16> [[TMP0]]
13762 int16x4_t test_vreinterpret_s16_s64(int64x1_t a) {
13763   return vreinterpret_s16_s64(a);
13764 }
13765 
13766 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u8(<8 x i8> %a) #0 {
13767 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
13768 // CHECK:   ret <4 x i16> [[TMP0]]
13769 int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) {
13770   return vreinterpret_s16_u8(a);
13771 }
13772 
13773 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u16(<4 x i16> %a) #0 {
13774 // CHECK:   ret <4 x i16> %a
13775 int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) {
13776   return vreinterpret_s16_u16(a);
13777 }
13778 
13779 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u32(<2 x i32> %a) #0 {
13780 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
13781 // CHECK:   ret <4 x i16> [[TMP0]]
13782 int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) {
13783   return vreinterpret_s16_u32(a);
13784 }
13785 
13786 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u64(<1 x i64> %a) #0 {
13787 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
13788 // CHECK:   ret <4 x i16> [[TMP0]]
13789 int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) {
13790   return vreinterpret_s16_u64(a);
13791 }
13792 
13793 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f16(<4 x half> %a) #0 {
13794 // CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
13795 // CHECK:   ret <4 x i16> [[TMP0]]
13796 int16x4_t test_vreinterpret_s16_f16(float16x4_t a) {
13797   return vreinterpret_s16_f16(a);
13798 }
13799 
13800 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f32(<2 x float> %a) #0 {
13801 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
13802 // CHECK:   ret <4 x i16> [[TMP0]]
13803 int16x4_t test_vreinterpret_s16_f32(float32x2_t a) {
13804   return vreinterpret_s16_f32(a);
13805 }
13806 
13807 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p8(<8 x i8> %a) #0 {
13808 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
13809 // CHECK:   ret <4 x i16> [[TMP0]]
13810 int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) {
13811   return vreinterpret_s16_p8(a);
13812 }
13813 
13814 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p16(<4 x i16> %a) #0 {
13815 // CHECK:   ret <4 x i16> %a
13816 int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) {
13817   return vreinterpret_s16_p16(a);
13818 }
13819 
13820 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s8(<8 x i8> %a) #0 {
13821 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
13822 // CHECK:   ret <2 x i32> [[TMP0]]
13823 int32x2_t test_vreinterpret_s32_s8(int8x8_t a) {
13824   return vreinterpret_s32_s8(a);
13825 }
13826 
13827 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s16(<4 x i16> %a) #0 {
13828 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
13829 // CHECK:   ret <2 x i32> [[TMP0]]
13830 int32x2_t test_vreinterpret_s32_s16(int16x4_t a) {
13831   return vreinterpret_s32_s16(a);
13832 }
13833 
13834 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s64(<1 x i64> %a) #0 {
13835 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
13836 // CHECK:   ret <2 x i32> [[TMP0]]
13837 int32x2_t test_vreinterpret_s32_s64(int64x1_t a) {
13838   return vreinterpret_s32_s64(a);
13839 }
13840 
13841 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u8(<8 x i8> %a) #0 {
13842 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
13843 // CHECK:   ret <2 x i32> [[TMP0]]
13844 int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) {
13845   return vreinterpret_s32_u8(a);
13846 }
13847 
13848 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u16(<4 x i16> %a) #0 {
13849 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
13850 // CHECK:   ret <2 x i32> [[TMP0]]
13851 int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) {
13852   return vreinterpret_s32_u16(a);
13853 }
13854 
13855 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u32(<2 x i32> %a) #0 {
13856 // CHECK:   ret <2 x i32> %a
13857 int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) {
13858   return vreinterpret_s32_u32(a);
13859 }
13860 
13861 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u64(<1 x i64> %a) #0 {
13862 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
13863 // CHECK:   ret <2 x i32> [[TMP0]]
13864 int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) {
13865   return vreinterpret_s32_u64(a);
13866 }
13867 
13868 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f16(<4 x half> %a) #0 {
13869 // CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
13870 // CHECK:   ret <2 x i32> [[TMP0]]
13871 int32x2_t test_vreinterpret_s32_f16(float16x4_t a) {
13872   return vreinterpret_s32_f16(a);
13873 }
13874 
13875 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f32(<2 x float> %a) #0 {
13876 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
13877 // CHECK:   ret <2 x i32> [[TMP0]]
13878 int32x2_t test_vreinterpret_s32_f32(float32x2_t a) {
13879   return vreinterpret_s32_f32(a);
13880 }
13881 
13882 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p8(<8 x i8> %a) #0 {
13883 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
13884 // CHECK:   ret <2 x i32> [[TMP0]]
13885 int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) {
13886   return vreinterpret_s32_p8(a);
13887 }
13888 
13889 // CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p16(<4 x i16> %a) #0 {
13890 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
13891 // CHECK:   ret <2 x i32> [[TMP0]]
13892 int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) {
13893   return vreinterpret_s32_p16(a);
13894 }
13895 
13896 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(<8 x i8> %a) #0 {
13897 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
13898 // CHECK:   ret <1 x i64> [[TMP0]]
13899 int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
13900   return vreinterpret_s64_s8(a);
13901 }
13902 
13903 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(<4 x i16> %a) #0 {
13904 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
13905 // CHECK:   ret <1 x i64> [[TMP0]]
13906 int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
13907   return vreinterpret_s64_s16(a);
13908 }
13909 
13910 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(<2 x i32> %a) #0 {
13911 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
13912 // CHECK:   ret <1 x i64> [[TMP0]]
13913 int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
13914   return vreinterpret_s64_s32(a);
13915 }
13916 
13917 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(<8 x i8> %a) #0 {
13918 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
13919 // CHECK:   ret <1 x i64> [[TMP0]]
13920 int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
13921   return vreinterpret_s64_u8(a);
13922 }
13923 
13924 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(<4 x i16> %a) #0 {
13925 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
13926 // CHECK:   ret <1 x i64> [[TMP0]]
13927 int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
13928   return vreinterpret_s64_u16(a);
13929 }
13930 
13931 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(<2 x i32> %a) #0 {
13932 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
13933 // CHECK:   ret <1 x i64> [[TMP0]]
test_vreinterpret_s64_u32(uint32x2_t a)13934 int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) {
13935   return vreinterpret_s64_u32(a);
13936 }
13937 
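// Note (added comment): when source and destination share the same LLVM IR
// vector type (IR integer vectors are signless, so e.g. int64x1_t and
// uint64x1_t are both <1 x i64>), the reinterpret is a no-op and the
// argument is returned directly, with no bitcast emitted.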
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u64(<1 x i64> %a) #0 {
// CHECK:   ret <1 x i64> %a
int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) {
  return vreinterpret_s64_u64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
  return vreinterpret_s64_f16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
  return vreinterpret_s64_f32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
  return vreinterpret_s64_p8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) {
  return vreinterpret_s64_p16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) {
  return vreinterpret_u8_s8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) {
  return vreinterpret_u8_s16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) {
  return vreinterpret_u8_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) {
  return vreinterpret_u8_s64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) {
  return vreinterpret_u8_u16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) {
  return vreinterpret_u8_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) {
  return vreinterpret_u8_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) {
  return vreinterpret_u8_f16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) {
  return vreinterpret_u8_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) {
  return vreinterpret_u8_p8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) {
  return vreinterpret_u8_p16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) {
  return vreinterpret_u16_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s16(<4 x i16> %a) #0 {
// CHECK:   ret <4 x i16> %a
uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) {
  return vreinterpret_u16_s16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) {
  return vreinterpret_u16_s32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) {
  return vreinterpret_u16_s64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) {
  return vreinterpret_u16_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) {
  return vreinterpret_u16_u32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) {
  return vreinterpret_u16_u64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) {
  return vreinterpret_u16_f16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) {
  return vreinterpret_u16_f32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) {
  return vreinterpret_u16_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p16(<4 x i16> %a) #0 {
// CHECK:   ret <4 x i16> %a
uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) {
  return vreinterpret_u16_p16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) {
  return vreinterpret_u32_s8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) {
  return vreinterpret_u32_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s32(<2 x i32> %a) #0 {
// CHECK:   ret <2 x i32> %a
uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) {
  return vreinterpret_u32_s32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) {
  return vreinterpret_u32_s64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) {
  return vreinterpret_u32_u8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) {
  return vreinterpret_u32_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) {
  return vreinterpret_u32_u64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) {
  return vreinterpret_u32_f16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) {
  return vreinterpret_u32_f32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) {
  return vreinterpret_u32_p8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) {
  return vreinterpret_u32_p16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
  return vreinterpret_u64_s8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
  return vreinterpret_u64_s16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) {
  return vreinterpret_u64_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s64(<1 x i64> %a) #0 {
// CHECK:   ret <1 x i64> %a
uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) {
  return vreinterpret_u64_s64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
  return vreinterpret_u64_u8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
  return vreinterpret_u64_u16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
  return vreinterpret_u64_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
  return vreinterpret_u64_f16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
  return vreinterpret_u64_f32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
  return vreinterpret_u64_p8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) {
  return vreinterpret_u64_p16(a);
}

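// Note (added comment): reinterprets involving the floating-point element
// types below lower the same way as the integer cases above: a single IR
// bitcast between <4 x half> or <2 x float> and the corresponding 64-bit
// vector layout, with no lane reordering.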
// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s8(int8x8_t a) {
  return vreinterpret_f16_s8(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s16(int16x4_t a) {
  return vreinterpret_f16_s16(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s32(int32x2_t a) {
  return vreinterpret_f16_s32(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s64(int64x1_t a) {
  return vreinterpret_f16_s64(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) {
  return vreinterpret_f16_u8(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) {
  return vreinterpret_f16_u16(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) {
  return vreinterpret_f16_u32(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) {
  return vreinterpret_f16_u64(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_f32(float32x2_t a) {
  return vreinterpret_f16_f32(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) {
  return vreinterpret_f16_p8(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) {
  return vreinterpret_f16_p16(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s8(int8x8_t a) {
  return vreinterpret_f32_s8(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s16(int16x4_t a) {
  return vreinterpret_f32_s16(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s32(int32x2_t a) {
  return vreinterpret_f32_s32(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s64(int64x1_t a) {
  return vreinterpret_f32_s64(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) {
  return vreinterpret_f32_u8(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) {
  return vreinterpret_f32_u16(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) {
  return vreinterpret_f32_u32(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) {
  return vreinterpret_f32_u64(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_f16(float16x4_t a) {
  return vreinterpret_f32_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) {
  return vreinterpret_f32_p8(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) {
  return vreinterpret_f32_p16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) {
  return vreinterpret_p8_s8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) {
  return vreinterpret_p8_s16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) {
  return vreinterpret_p8_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) {
  return vreinterpret_p8_s64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) {
  return vreinterpret_p8_u8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) {
  return vreinterpret_p8_u16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) {
  return vreinterpret_p8_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) {
  return vreinterpret_p8_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) {
  return vreinterpret_p8_f16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) {
  return vreinterpret_p8_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) {
  return vreinterpret_p8_p16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) {
  return vreinterpret_p16_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s16(<4 x i16> %a) #0 {
// CHECK:   ret <4 x i16> %a
poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) {
  return vreinterpret_p16_s16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) {
  return vreinterpret_p16_s32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) {
  return vreinterpret_p16_s64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) {
  return vreinterpret_p16_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u16(<4 x i16> %a) #0 {
// CHECK:   ret <4 x i16> %a
poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) {
  return vreinterpret_p16_u16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) {
  return vreinterpret_p16_u32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) {
  return vreinterpret_p16_u64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) {
  return vreinterpret_p16_f16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) {
  return vreinterpret_p16_f32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) {
  return vreinterpret_p16_p8(a);
}

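// Note (added comment): the vreinterpretq_* tests below cover the same
// conversion matrix for the 128-bit Q-register types. The lowering pattern
// is identical: a plain return when the IR types already match, otherwise a
// single 16-byte bitcast.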
// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) {
  return vreinterpretq_s8_s16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) {
  return vreinterpretq_s8_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) {
  return vreinterpretq_s8_s64(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u8(<16 x i8> %a) #0 {
// CHECK:   ret <16 x i8> %a
int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) {
  return vreinterpretq_s8_u8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) {
  return vreinterpretq_s8_u16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) {
  return vreinterpretq_s8_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) {
  return vreinterpretq_s8_u64(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) {
  return vreinterpretq_s8_f16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) {
  return vreinterpretq_s8_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p8(<16 x i8> %a) #0 {
// CHECK:   ret <16 x i8> %a
int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) {
  return vreinterpretq_s8_p8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) {
  return vreinterpretq_s8_p16(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) {
  return vreinterpretq_s16_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) {
  return vreinterpretq_s16_s32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) {
  return vreinterpretq_s16_s64(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) {
  return vreinterpretq_s16_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u16(<8 x i16> %a) #0 {
// CHECK:   ret <8 x i16> %a
int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) {
  return vreinterpretq_s16_u16(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) {
  return vreinterpretq_s16_u32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) {
  return vreinterpretq_s16_u64(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) {
  return vreinterpretq_s16_f16(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) {
  return vreinterpretq_s16_f32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) {
  return vreinterpretq_s16_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p16(<8 x i16> %a) #0 {
// CHECK:   ret <8 x i16> %a
int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) {
  return vreinterpretq_s16_p16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) {
  return vreinterpretq_s32_s8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) {
  return vreinterpretq_s32_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) {
  return vreinterpretq_s32_s64(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) {
  return vreinterpretq_s32_u8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) {
  return vreinterpretq_s32_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u32(<4 x i32> %a) #0 {
// CHECK:   ret <4 x i32> %a
int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) {
  return vreinterpretq_s32_u32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) {
  return vreinterpretq_s32_u64(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) {
  return vreinterpretq_s32_f16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) {
  return vreinterpretq_s32_f32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) {
  return vreinterpretq_s32_p8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) {
  return vreinterpretq_s32_p16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) {
  return vreinterpretq_s64_s8(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) {
  return vreinterpretq_s64_s16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) {
  return vreinterpretq_s64_s32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) {
  return vreinterpretq_s64_u8(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) {
  return vreinterpretq_s64_u16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) {
  return vreinterpretq_s64_u32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u64(<2 x i64> %a) #0 {
// CHECK:   ret <2 x i64> %a
int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) {
  return vreinterpretq_s64_u64(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) {
  return vreinterpretq_s64_f16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) {
  return vreinterpretq_s64_f32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) {
  return vreinterpretq_s64_p8(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) {
  return vreinterpretq_s64_p16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s8(<16 x i8> %a) #0 {
// CHECK:   ret <16 x i8> %a
uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) {
  return vreinterpretq_u8_s8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) {
  return vreinterpretq_u8_s16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) {
  return vreinterpretq_u8_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) {
  return vreinterpretq_u8_s64(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) {
  return vreinterpretq_u8_u16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) {
  return vreinterpretq_u8_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) {
  return vreinterpretq_u8_u64(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) {
  return vreinterpretq_u8_f16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) {
  return vreinterpretq_u8_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p8(<16 x i8> %a) #0 {
// CHECK:   ret <16 x i8> %a
uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) {
  return vreinterpretq_u8_p8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) {
  return vreinterpretq_u8_p16(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) {
  return vreinterpretq_u16_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s16(<8 x i16> %a) #0 {
// CHECK:   ret <8 x i16> %a
uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) {
  return vreinterpretq_u16_s16(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) {
  return vreinterpretq_u16_s32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) {
  return vreinterpretq_u16_s64(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) {
  return vreinterpretq_u16_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) {
  return vreinterpretq_u16_u32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) {
  return vreinterpretq_u16_u64(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) {
  return vreinterpretq_u16_f16(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) {
  return vreinterpretq_u16_f32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) {
  return vreinterpretq_u16_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p16(<8 x i16> %a) #0 {
// CHECK:   ret <8 x i16> %a
uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) {
  return vreinterpretq_u16_p16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) {
  return vreinterpretq_u32_s8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) {
  return vreinterpretq_u32_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s32(<4 x i32> %a) #0 {
// CHECK:   ret <4 x i32> %a
uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) {
  return vreinterpretq_u32_s32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) {
  return vreinterpretq_u32_s64(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) {
  return vreinterpretq_u32_u8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) {
  return vreinterpretq_u32_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) {
  return vreinterpretq_u32_u64(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) {
  return vreinterpretq_u32_f16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) {
  return vreinterpretq_u32_f32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) {
  return vreinterpretq_u32_p8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP0]]
uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) {
  return vreinterpretq_u32_p16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) {
  return vreinterpretq_u64_s8(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) {
  return vreinterpretq_u64_s16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) {
  return vreinterpretq_u64_s32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s64(<2 x i64> %a) #0 {
// CHECK:   ret <2 x i64> %a
uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) {
  return vreinterpretq_u64_s64(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) {
  return vreinterpretq_u64_u8(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) {
  return vreinterpretq_u64_u16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) {
  return vreinterpretq_u64_u32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) {
  return vreinterpretq_u64_f16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) {
  return vreinterpretq_u64_f32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p8(<16 x i8> %a) #0 {
15169 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
15170 // CHECK:   ret <2 x i64> [[TMP0]]
test_vreinterpretq_u64_p8(poly8x16_t a)15171 uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) {
15172   return vreinterpretq_u64_p8(a);
15173 }
15174 
15175 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p16(<8 x i16> %a) #0 {
15176 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
15177 // CHECK:   ret <2 x i64> [[TMP0]]
test_vreinterpretq_u64_p16(poly16x8_t a)15178 uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) {
15179   return vreinterpretq_u64_p16(a);
15180 }
15181 
15182 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s8(<16 x i8> %a) #0 {
15183 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
15184 // CHECK:   ret <8 x half> [[TMP0]]
test_vreinterpretq_f16_s8(int8x16_t a)15185 float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) {
15186   return vreinterpretq_f16_s8(a);
15187 }
15188 
15189 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s16(<8 x i16> %a) #0 {
15190 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
15191 // CHECK:   ret <8 x half> [[TMP0]]
test_vreinterpretq_f16_s16(int16x8_t a)15192 float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) {
15193   return vreinterpretq_f16_s16(a);
15194 }
15195 
15196 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s32(<4 x i32> %a) #0 {
15197 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
15198 // CHECK:   ret <8 x half> [[TMP0]]
test_vreinterpretq_f16_s32(int32x4_t a)15199 float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) {
15200   return vreinterpretq_f16_s32(a);
15201 }
15202 
15203 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s64(<2 x i64> %a) #0 {
15204 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
15205 // CHECK:   ret <8 x half> [[TMP0]]
test_vreinterpretq_f16_s64(int64x2_t a)15206 float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) {
15207   return vreinterpretq_f16_s64(a);
15208 }
15209 
15210 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u8(<16 x i8> %a) #0 {
15211 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
15212 // CHECK:   ret <8 x half> [[TMP0]]
test_vreinterpretq_f16_u8(uint8x16_t a)15213 float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) {
15214   return vreinterpretq_f16_u8(a);
15215 }
15216 
15217 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u16(<8 x i16> %a) #0 {
15218 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
15219 // CHECK:   ret <8 x half> [[TMP0]]
test_vreinterpretq_f16_u16(uint16x8_t a)15220 float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) {
15221   return vreinterpretq_f16_u16(a);
15222 }
15223 
15224 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u32(<4 x i32> %a) #0 {
15225 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
15226 // CHECK:   ret <8 x half> [[TMP0]]
test_vreinterpretq_f16_u32(uint32x4_t a)15227 float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) {
15228   return vreinterpretq_f16_u32(a);
15229 }
15230 
15231 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u64(<2 x i64> %a) #0 {
15232 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
15233 // CHECK:   ret <8 x half> [[TMP0]]
test_vreinterpretq_f16_u64(uint64x2_t a)15234 float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) {
15235   return vreinterpretq_f16_u64(a);
15236 }
15237 
15238 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f32(<4 x float> %a) #0 {
15239 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
15240 // CHECK:   ret <8 x half> [[TMP0]]
test_vreinterpretq_f16_f32(float32x4_t a)15241 float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) {
15242   return vreinterpretq_f16_f32(a);
15243 }
15244 
15245 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p8(<16 x i8> %a) #0 {
15246 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
15247 // CHECK:   ret <8 x half> [[TMP0]]
test_vreinterpretq_f16_p8(poly8x16_t a)15248 float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) {
15249   return vreinterpretq_f16_p8(a);
15250 }
15251 
15252 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p16(<8 x i16> %a) #0 {
15253 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
15254 // CHECK:   ret <8 x half> [[TMP0]]
test_vreinterpretq_f16_p16(poly16x8_t a)15255 float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) {
15256   return vreinterpretq_f16_p16(a);
15257 }
15258 
15259 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s8(<16 x i8> %a) #0 {
15260 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
15261 // CHECK:   ret <4 x float> [[TMP0]]
test_vreinterpretq_f32_s8(int8x16_t a)15262 float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) {
15263   return vreinterpretq_f32_s8(a);
15264 }
15265 
15266 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s16(<8 x i16> %a) #0 {
15267 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
15268 // CHECK:   ret <4 x float> [[TMP0]]
test_vreinterpretq_f32_s16(int16x8_t a)15269 float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) {
15270   return vreinterpretq_f32_s16(a);
15271 }
15272 
15273 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s32(<4 x i32> %a) #0 {
15274 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
15275 // CHECK:   ret <4 x float> [[TMP0]]
test_vreinterpretq_f32_s32(int32x4_t a)15276 float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) {
15277   return vreinterpretq_f32_s32(a);
15278 }
15279 
15280 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s64(<2 x i64> %a) #0 {
15281 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
15282 // CHECK:   ret <4 x float> [[TMP0]]
test_vreinterpretq_f32_s64(int64x2_t a)15283 float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) {
15284   return vreinterpretq_f32_s64(a);
15285 }
15286 
15287 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u8(<16 x i8> %a) #0 {
15288 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
15289 // CHECK:   ret <4 x float> [[TMP0]]
test_vreinterpretq_f32_u8(uint8x16_t a)15290 float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) {
15291   return vreinterpretq_f32_u8(a);
15292 }
15293 
15294 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u16(<8 x i16> %a) #0 {
15295 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
15296 // CHECK:   ret <4 x float> [[TMP0]]
test_vreinterpretq_f32_u16(uint16x8_t a)15297 float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) {
15298   return vreinterpretq_f32_u16(a);
15299 }
15300 
15301 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u32(<4 x i32> %a) #0 {
15302 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
15303 // CHECK:   ret <4 x float> [[TMP0]]
test_vreinterpretq_f32_u32(uint32x4_t a)15304 float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) {
15305   return vreinterpretq_f32_u32(a);
15306 }
15307 
15308 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u64(<2 x i64> %a) #0 {
15309 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
15310 // CHECK:   ret <4 x float> [[TMP0]]
test_vreinterpretq_f32_u64(uint64x2_t a)15311 float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) {
15312   return vreinterpretq_f32_u64(a);
15313 }
15314 
15315 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f16(<8 x half> %a) #0 {
15316 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
15317 // CHECK:   ret <4 x float> [[TMP0]]
test_vreinterpretq_f32_f16(float16x8_t a)15318 float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) {
15319   return vreinterpretq_f32_f16(a);
15320 }
15321 
15322 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p8(<16 x i8> %a) #0 {
15323 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
15324 // CHECK:   ret <4 x float> [[TMP0]]
test_vreinterpretq_f32_p8(poly8x16_t a)15325 float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) {
15326   return vreinterpretq_f32_p8(a);
15327 }
15328 
15329 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p16(<8 x i16> %a) #0 {
15330 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
15331 // CHECK:   ret <4 x float> [[TMP0]]
test_vreinterpretq_f32_p16(poly16x8_t a)15332 float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) {
15333   return vreinterpretq_f32_p16(a);
15334 }
15335 
15336 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s8(<16 x i8> %a) #0 {
15337 // CHECK:   ret <16 x i8> %a
test_vreinterpretq_p8_s8(int8x16_t a)15338 poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) {
15339   return vreinterpretq_p8_s8(a);
15340 }
15341 
15342 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s16(<8 x i16> %a) #0 {
15343 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
15344 // CHECK:   ret <16 x i8> [[TMP0]]
test_vreinterpretq_p8_s16(int16x8_t a)15345 poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) {
15346   return vreinterpretq_p8_s16(a);
15347 }
15348 
15349 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s32(<4 x i32> %a) #0 {
15350 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
15351 // CHECK:   ret <16 x i8> [[TMP0]]
test_vreinterpretq_p8_s32(int32x4_t a)15352 poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) {
15353   return vreinterpretq_p8_s32(a);
15354 }
15355 
15356 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s64(<2 x i64> %a) #0 {
15357 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
15358 // CHECK:   ret <16 x i8> [[TMP0]]
test_vreinterpretq_p8_s64(int64x2_t a)15359 poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) {
15360   return vreinterpretq_p8_s64(a);
15361 }
15362 
15363 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u8(<16 x i8> %a) #0 {
15364 // CHECK:   ret <16 x i8> %a
test_vreinterpretq_p8_u8(uint8x16_t a)15365 poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) {
15366   return vreinterpretq_p8_u8(a);
15367 }
15368 
15369 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u16(<8 x i16> %a) #0 {
15370 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
15371 // CHECK:   ret <16 x i8> [[TMP0]]
test_vreinterpretq_p8_u16(uint16x8_t a)15372 poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) {
15373   return vreinterpretq_p8_u16(a);
15374 }
15375 
15376 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u32(<4 x i32> %a) #0 {
15377 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
15378 // CHECK:   ret <16 x i8> [[TMP0]]
test_vreinterpretq_p8_u32(uint32x4_t a)15379 poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) {
15380   return vreinterpretq_p8_u32(a);
15381 }
15382 
15383 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u64(<2 x i64> %a) #0 {
15384 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
15385 // CHECK:   ret <16 x i8> [[TMP0]]
test_vreinterpretq_p8_u64(uint64x2_t a)15386 poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) {
15387   return vreinterpretq_p8_u64(a);
15388 }
15389 
15390 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f16(<8 x half> %a) #0 {
15391 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
15392 // CHECK:   ret <16 x i8> [[TMP0]]
test_vreinterpretq_p8_f16(float16x8_t a)15393 poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) {
15394   return vreinterpretq_p8_f16(a);
15395 }
15396 
15397 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f32(<4 x float> %a) #0 {
15398 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
15399 // CHECK:   ret <16 x i8> [[TMP0]]
test_vreinterpretq_p8_f32(float32x4_t a)15400 poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) {
15401   return vreinterpretq_p8_f32(a);
15402 }
15403 
15404 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p16(<8 x i16> %a) #0 {
15405 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
15406 // CHECK:   ret <16 x i8> [[TMP0]]
test_vreinterpretq_p8_p16(poly16x8_t a)15407 poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) {
15408   return vreinterpretq_p8_p16(a);
15409 }
15410 
15411 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s8(<16 x i8> %a) #0 {
15412 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
15413 // CHECK:   ret <8 x i16> [[TMP0]]
test_vreinterpretq_p16_s8(int8x16_t a)15414 poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) {
15415   return vreinterpretq_p16_s8(a);
15416 }
15417 
15418 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s16(<8 x i16> %a) #0 {
15419 // CHECK:   ret <8 x i16> %a
test_vreinterpretq_p16_s16(int16x8_t a)15420 poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) {
15421   return vreinterpretq_p16_s16(a);
15422 }
15423 
15424 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s32(<4 x i32> %a) #0 {
15425 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
15426 // CHECK:   ret <8 x i16> [[TMP0]]
test_vreinterpretq_p16_s32(int32x4_t a)15427 poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) {
15428   return vreinterpretq_p16_s32(a);
15429 }
15430 
15431 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s64(<2 x i64> %a) #0 {
15432 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
15433 // CHECK:   ret <8 x i16> [[TMP0]]
test_vreinterpretq_p16_s64(int64x2_t a)15434 poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) {
15435   return vreinterpretq_p16_s64(a);
15436 }
15437 
15438 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u8(<16 x i8> %a) #0 {
15439 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
15440 // CHECK:   ret <8 x i16> [[TMP0]]
test_vreinterpretq_p16_u8(uint8x16_t a)15441 poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) {
15442   return vreinterpretq_p16_u8(a);
15443 }
15444 
15445 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u16(<8 x i16> %a) #0 {
15446 // CHECK:   ret <8 x i16> %a
test_vreinterpretq_p16_u16(uint16x8_t a)15447 poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) {
15448   return vreinterpretq_p16_u16(a);
15449 }
15450 
15451 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u32(<4 x i32> %a) #0 {
15452 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
15453 // CHECK:   ret <8 x i16> [[TMP0]]
test_vreinterpretq_p16_u32(uint32x4_t a)15454 poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) {
15455   return vreinterpretq_p16_u32(a);
15456 }
15457 
15458 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u64(<2 x i64> %a) #0 {
15459 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
15460 // CHECK:   ret <8 x i16> [[TMP0]]
test_vreinterpretq_p16_u64(uint64x2_t a)15461 poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) {
15462   return vreinterpretq_p16_u64(a);
15463 }
15464 
15465 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f16(<8 x half> %a) #0 {
15466 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
15467 // CHECK:   ret <8 x i16> [[TMP0]]
test_vreinterpretq_p16_f16(float16x8_t a)15468 poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) {
15469   return vreinterpretq_p16_f16(a);
15470 }
15471 
15472 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f32(<4 x float> %a) #0 {
15473 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
15474 // CHECK:   ret <8 x i16> [[TMP0]]
test_vreinterpretq_p16_f32(float32x4_t a)15475 poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) {
15476   return vreinterpretq_p16_f32(a);
15477 }
15478 
15479 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p8(<16 x i8> %a) #0 {
15480 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
15481 // CHECK:   ret <8 x i16> [[TMP0]]
test_vreinterpretq_p16_p8(poly8x16_t a)15482 poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
15483   return vreinterpretq_p16_p8(a);
15484 }
15485 
15486 
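// vrev16 reverses the byte order within each 16-bit halfword of the vector,
// which Clang emits as a single shufflevector with a <1,0,3,2,...> mask.
// A minimal scalar model of the same operation (illustrative only; the
// helper name is hypothetical and not part of this test):
//   void rev16_model(uint8_t v[8]) {
//     for (int i = 0; i < 8; i += 2) {
//       uint8_t t = v[i]; v[i] = v[i + 1]; v[i + 1] = t;  // swap byte pairs
//     }
//   }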
// CHECK-LABEL: define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev16_s8(int8x8_t a) {
  return vrev16_s8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev16_u8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev16_u8(uint8x8_t a) {
  return vrev16_u8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev16_p8(poly8x8_t a) {
  return vrev16_p8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev16q_s8(int8x16_t a) {
  return vrev16q_s8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev16q_u8(uint8x16_t a) {
  return vrev16q_u8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev16q_p8(poly8x16_t a) {
  return vrev16q_p8(a);
}


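// vrev32 reverses the element order within each 32-bit word: byte vectors
// swap groups of four bytes, halfword vectors swap halfword pairs. The
// lowering is again a single shufflevector, e.g. <3,2,1,0,7,6,5,4> for
// an <8 x i8> operand.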
// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev32_s8(int8x8_t a) {
  return vrev32_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vrev32_s16(int16x4_t a) {
  return vrev32_s16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev32_u8(uint8x8_t a) {
  return vrev32_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev32_u16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vrev32_u16(uint16x4_t a) {
  return vrev32_u16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev32_p8(poly8x8_t a) {
  return vrev32_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vrev32_p16(poly16x4_t a) {
  return vrev32_p16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev32q_s8(int8x16_t a) {
  return vrev32q_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vrev32q_s16(int16x8_t a) {
  return vrev32q_s16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev32q_u8(uint8x16_t a) {
  return vrev32q_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vrev32q_u16(uint16x8_t a) {
  return vrev32q_u16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev32q_p8(poly8x16_t a) {
  return vrev32q_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vrev32q_p16(poly16x8_t a) {
  return vrev32q_p16(a);
}


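// vrev64 reverses the element order within each 64-bit doubleword, so a
// d-register operand is reversed wholesale while a q-register operand is
// reversed independently in each half. Note there is no vrev64 variant for
// 64-bit elements, since reversing a single element would be a no-op.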
// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev64_s8(int8x8_t a) {
  return vrev64_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vrev64_s16(int16x4_t a) {
  return vrev64_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
int32x2_t test_vrev64_s32(int32x2_t a) {
  return vrev64_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev64_u8(uint8x8_t a) {
  return vrev64_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vrev64_u16(uint16x4_t a) {
  return vrev64_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vrev64_u32(uint32x2_t a) {
  return vrev64_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev64_p8(poly8x8_t a) {
  return vrev64_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vrev64_p16(poly16x4_t a) {
  return vrev64_p16(a);
}

// CHECK-LABEL: define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
// CHECK:   ret <2 x float> [[SHUFFLE_I]]
float32x2_t test_vrev64_f32(float32x2_t a) {
  return vrev64_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev64q_s8(int8x16_t a) {
  return vrev64q_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vrev64q_s16(int16x8_t a) {
  return vrev64q_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vrev64q_s32(int32x4_t a) {
  return vrev64q_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev64q_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev64q_u8(uint8x16_t a) {
  return vrev64q_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vrev64q_u16(uint16x8_t a) {
  return vrev64q_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vrev64q_u32(uint32x4_t a) {
  return vrev64q_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev64q_p8(poly8x16_t a) {
  return vrev64q_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vrev64q_p16(poly16x8_t a) {
  return vrev64q_p16(a);
}

// CHECK-LABEL: define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x float> [[SHUFFLE_I]]
float32x4_t test_vrev64q_f32(float32x4_t a) {
  return vrev64q_f32(a);
}


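// vrhadd is the rounding halving add: each lane computes (a + b + 1) >> 1 in
// a widened intermediate, so the sum cannot overflow. It lowers to the
// llvm.arm.neon.vrhadds (signed) or vrhaddu (unsigned) intrinsic. A scalar
// sketch of one signed 8-bit lane (illustrative only, helper name
// hypothetical):
//   int8_t rhadd_lane(int8_t x, int8_t y) {
//     return (int8_t)(((int16_t)x + (int16_t)y + 1) >> 1);  // widen, round, halve
//   }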
// CHECK-LABEL: define <8 x i8> @test_vrhadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
int8x8_t test_vrhadd_s8(int8x8_t a, int8x8_t b) {
  return vrhadd_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vrhadd_s16(int16x4_t a, int16x4_t b) {
  return vrhadd_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vrhadd_s32(int32x2_t a, int32x2_t b) {
  return vrhadd_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vrhadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
uint8x8_t test_vrhadd_u8(uint8x8_t a, uint8x8_t b) {
  return vrhadd_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vrhadd_u16(uint16x4_t a, uint16x4_t b) {
  return vrhadd_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vrhadd_u32(uint32x2_t a, uint32x2_t b) {
  return vrhadd_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vrhaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
int8x16_t test_vrhaddq_s8(int8x16_t a, int8x16_t b) {
  return vrhaddq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) {
  return vrhaddq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vrhaddq_s32(int32x4_t a, int32x4_t b) {
  return vrhaddq_s32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vrhaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b) {
  return vrhaddq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vrhaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vrhaddq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b) {
  return vrhaddq_u32(a, b);
}


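// vrshl is the rounding shift by a signed, per-lane count: positive counts
// shift left, negative counts shift right with rounding. Both operands are
// vectors, so the shift amount can differ per lane; the intrinsic pair is
// llvm.arm.neon.vrshifts / vrshiftu. Note that for the unsigned variants
// only the data operand is unsigned; the shift-count vector stays signed.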
// CHECK-LABEL: define <8 x i8> @test_vrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VRSHL_V_I]]
int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
  return vrshl_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
  return vrshl_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
  return vrshl_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
  return vrshl_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VRSHL_V_I]]
uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
  return vrshl_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
  return vrshl_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
  return vrshl_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
  return vrshl_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VRSHLQ_V_I]]
int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
  return vrshlq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
  return vrshlq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
  return vrshlq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
  return vrshlq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VRSHLQ_V_I]]
uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
  return vrshlq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
  return vrshlq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
  return vrshlq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
  return vrshlq_u64(a, b);
}


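// vrshrn_n shifts each lane right by an immediate with rounding and narrows
// the result to half the element width. In the unoptimized IR the immediate
// is encoded as a splat of the negated shift amount passed to
// llvm.arm.neon.vrshiftn, which is why shifting by 1 appears below as a
// constant vector of -1.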
// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i8> [[VRSHRN_N1]]
int8x8_t test_vrshrn_n_s16(int16x8_t a) {
  return vrshrn_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i16> [[VRSHRN_N1]]
int16x4_t test_vrshrn_n_s32(int32x4_t a) {
  return vrshrn_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i32> [[VRSHRN_N1]]
int32x2_t test_vrshrn_n_s64(int64x2_t a) {
  return vrshrn_n_s64(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i8> [[VRSHRN_N1]]
uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
  return vrshrn_n_u16(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i16> [[VRSHRN_N1]]
uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
  return vrshrn_n_u32(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i32> [[VRSHRN_N1]]
uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
  return vrshrn_n_u64(a, 1);
}


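// vrshr_n/vrshrq_n: rounding shift right by a constant with no narrowing,
// lowered to @llvm.arm.neon.vrshifts (signed) or vrshiftu (unsigned) with a
// negative splat shift amount.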
// CHECK-LABEL: define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) #0 {
// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <8 x i8> [[VRSHR_N]]
int8x8_t test_vrshr_n_s8(int8x8_t a) {
  return vrshr_n_s8(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <4 x i16> [[VRSHR_N1]]
int16x4_t test_vrshr_n_s16(int16x4_t a) {
  return vrshr_n_s16(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>)
// CHECK:   ret <2 x i32> [[VRSHR_N1]]
int32x2_t test_vrshr_n_s32(int32x2_t a) {
  return vrshr_n_s32(a, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vrshr_n_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
// CHECK:   ret <1 x i64> [[VRSHR_N1]]
int64x1_t test_vrshr_n_s64(int64x1_t a) {
  return vrshr_n_s64(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) #0 {
// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <8 x i8> [[VRSHR_N]]
uint8x8_t test_vrshr_n_u8(uint8x8_t a) {
  return vrshr_n_u8(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <4 x i16> [[VRSHR_N1]]
uint16x4_t test_vrshr_n_u16(uint16x4_t a) {
  return vrshr_n_u16(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>)
// CHECK:   ret <2 x i32> [[VRSHR_N1]]
uint32x2_t test_vrshr_n_u32(uint32x2_t a) {
  return vrshr_n_u32(a, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vrshr_n_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
// CHECK:   ret <1 x i64> [[VRSHR_N1]]
uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
  return vrshr_n_u64(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) #0 {
// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <16 x i8> [[VRSHR_N]]
int8x16_t test_vrshrq_n_s8(int8x16_t a) {
  return vrshrq_n_s8(a, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i16> [[VRSHR_N1]]
int16x8_t test_vrshrq_n_s16(int16x8_t a) {
  return vrshrq_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i32> [[VRSHR_N1]]
int32x4_t test_vrshrq_n_s32(int32x4_t a) {
  return vrshrq_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i64> [[VRSHR_N1]]
int64x2_t test_vrshrq_n_s64(int64x2_t a) {
  return vrshrq_n_s64(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) #0 {
// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <16 x i8> [[VRSHR_N]]
uint8x16_t test_vrshrq_n_u8(uint8x16_t a) {
  return vrshrq_n_u8(a, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i16> [[VRSHR_N1]]
uint16x8_t test_vrshrq_n_u16(uint16x8_t a) {
  return vrshrq_n_u16(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i32> [[VRSHR_N1]]
uint32x4_t test_vrshrq_n_u32(uint32x4_t a) {
  return vrshrq_n_u32(a, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i64> [[VRSHR_N1]]
uint64x2_t test_vrshrq_n_u64(uint64x2_t a) {
  return vrshrq_n_u64(a, 1);
}


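// vrsqrte/vrsqrteq: reciprocal square root estimate, for both float and
// unsigned fixed-point vectors.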
// CHECK-LABEL: define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> [[VRSQRTE_V_I]]) #4
// CHECK:   ret <2 x float> [[VRSQRTE_V1_I]]
float32x2_t test_vrsqrte_f32(float32x2_t a) {
  return vrsqrte_f32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vrsqrte_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> [[VRSQRTE_V_I]]) #4
// CHECK:   ret <2 x i32> [[VRSQRTE_V1_I]]
uint32x2_t test_vrsqrte_u32(uint32x2_t a) {
  return vrsqrte_u32(a);
}

// CHECK-LABEL: define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> [[VRSQRTEQ_V_I]]) #4
// CHECK:   ret <4 x float> [[VRSQRTEQ_V1_I]]
float32x4_t test_vrsqrteq_f32(float32x4_t a) {
  return vrsqrteq_f32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> [[VRSQRTEQ_V_I]]) #4
// CHECK:   ret <4 x i32> [[VRSQRTEQ_V1_I]]
uint32x4_t test_vrsqrteq_u32(uint32x4_t a) {
  return vrsqrteq_u32(a);
}


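// vrsqrts/vrsqrtsq: reciprocal square root step, the Newton-Raphson
// refinement companion to vrsqrte.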
// CHECK-LABEL: define <2 x float> @test_vrsqrts_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> [[VRSQRTS_V_I]], <2 x float> [[VRSQRTS_V1_I]]) #4
// CHECK:   [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vrsqrts_f32(float32x2_t a, float32x2_t b) {
  return vrsqrts_f32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vrsqrtsq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> [[VRSQRTSQ_V_I]], <4 x float> [[VRSQRTSQ_V1_I]]) #4
// CHECK:   [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <4 x float>
// CHECK:   ret <4 x float> [[TMP2]]
float32x4_t test_vrsqrtsq_f32(float32x4_t a, float32x4_t b) {
  return vrsqrtsq_f32(a, b);
}


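// vrsra_n/vrsraq_n: rounding shift right and accumulate; codegen is the
// rounding-shift intrinsic followed by a plain vector add.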
// CHECK-LABEL: define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
// CHECK:   ret <8 x i8> [[VRSRA_N]]
int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) {
  return vrsra_n_s8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[TMP3]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
// CHECK:   ret <4 x i16> [[VRSRA_N]]
int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) {
  return vrsra_n_s16(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[TMP3]], <2 x i32> <i32 -1, i32 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
// CHECK:   ret <2 x i32> [[VRSRA_N]]
int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) {
  return vrsra_n_s32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vrsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[TMP3]], <1 x i64> <i64 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
// CHECK:   ret <1 x i64> [[VRSRA_N]]
int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
  return vrsra_n_s64(a, b, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
// CHECK:   ret <8 x i8> [[VRSRA_N]]
uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) {
  return vrsra_n_u8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[TMP3]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
// CHECK:   ret <4 x i16> [[VRSRA_N]]
uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) {
  return vrsra_n_u16(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[TMP3]], <2 x i32> <i32 -1, i32 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
// CHECK:   ret <2 x i32> [[VRSRA_N]]
uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) {
  return vrsra_n_u32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vrsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[TMP3]], <1 x i64> <i64 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
// CHECK:   ret <1 x i64> [[VRSRA_N]]
uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
  return vrsra_n_u64(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
// CHECK:   ret <16 x i8> [[VRSRA_N]]
int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) {
  return vrsraq_n_s8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[TMP3]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
// CHECK:   ret <8 x i16> [[VRSRA_N]]
int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) {
  return vrsraq_n_s16(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[TMP3]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
// CHECK:   ret <4 x i32> [[VRSRA_N]]
int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) {
  return vrsraq_n_s32(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
// CHECK:   ret <2 x i64> [[VRSRA_N]]
int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) {
  return vrsraq_n_s64(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
// CHECK:   ret <16 x i8> [[VRSRA_N]]
uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) {
  return vrsraq_n_u8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[TMP3]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
// CHECK:   ret <8 x i16> [[VRSRA_N]]
uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) {
  return vrsraq_n_u16(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[TMP3]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
// CHECK:   ret <4 x i32> [[VRSRA_N]]
uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) {
  return vrsraq_n_u32(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
// CHECK:   ret <2 x i64> [[VRSRA_N]]
uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) {
  return vrsraq_n_u64(a, b, 1);
}


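// vrsubhn: subtract, then return the rounded high half of each wide lane.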
// CHECK-LABEL: define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4
// CHECK:   ret <8 x i8> [[VRSUBHN_V2_I]]
int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) {
  return vrsubhn_s16(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4
// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
  return vrsubhn_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4
// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
  return vrsubhn_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4
// CHECK:   ret <8 x i8> [[VRSUBHN_V2_I]]
uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) {
  return vrsubhn_u16(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4
// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
  return vrsubhn_u32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4
// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
  return vrsubhn_u64(a, b);
}


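// vset_lane/vsetq_lane: insert a scalar into one lane, a plain insertelement
// in IR. The f16 variants round-trip through memory to reinterpret half as i16.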
// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8(i8 zeroext %a, <8 x i8> %b) #0 {
// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VSET_LANE]]
uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
  return vset_lane_u8(a, b, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vset_lane_u16(i16 zeroext %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VSET_LANE]]
uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
  return vset_lane_u16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(i32 %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
// CHECK:   ret <2 x i32> [[VSET_LANE]]
uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
  return vset_lane_u32(a, b, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vset_lane_s8(i8 signext %a, <8 x i8> %b) #0 {
// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VSET_LANE]]
int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) {
  return vset_lane_s8(a, b, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vset_lane_s16(i16 signext %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VSET_LANE]]
int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) {
  return vset_lane_s16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vset_lane_s32(i32 %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
// CHECK:   ret <2 x i32> [[VSET_LANE]]
int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) {
  return vset_lane_s32(a, b, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vset_lane_p8(i8 signext %a, <8 x i8> %b) #0 {
// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VSET_LANE]]
poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) {
  return vset_lane_p8(a, b, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vset_lane_p16(i16 signext %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VSET_LANE]]
poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) {
  return vset_lane_p16(a, b, 3);
}

// CHECK-LABEL: define <2 x float> @test_vset_lane_f32(float %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x float> [[TMP1]], float %a, i32 1
// CHECK:   ret <2 x float> [[VSET_LANE]]
float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
  return vset_lane_f32(a, b, 1);
}

// CHECK-LABEL: define <4 x half> @test_vset_lane_f16(half* %a, <4 x half> %b) #0 {
// CHECK:   [[__REINT_246:%.*]] = alloca half, align 2
// CHECK:   [[__REINT1_246:%.*]] = alloca <4 x half>, align 8
// CHECK:   [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8
// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
// CHECK:   store half [[TMP0]], half* [[__REINT_246]], align 2
// CHECK:   store <4 x half> %b, <4 x half>* [[__REINT1_246]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast half* [[__REINT_246]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = bitcast <4 x half>* [[__REINT1_246]] to <4 x i16>*
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[TMP2]], i32 1
// CHECK:   store <4 x i16> [[VSET_LANE]], <4 x i16>* [[__REINT2_246]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16>* [[__REINT2_246]] to <4 x half>*
// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[TMP7]], align 8
// CHECK:   ret <4 x half> [[TMP8]]
float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) {
  return vset_lane_f16(*a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_u8(i8 zeroext %a, <16 x i8> %b) #0 {
// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VSET_LANE]]
uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
  return vsetq_lane_u8(a, b, 15);
}

// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_u16(i16 zeroext %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VSET_LANE]]
uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
  return vsetq_lane_u16(a, b, 7);
}

// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_u32(i32 %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
// CHECK:   ret <4 x i32> [[VSET_LANE]]
uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
  return vsetq_lane_u32(a, b, 3);
}

// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_s8(i8 signext %a, <16 x i8> %b) #0 {
// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VSET_LANE]]
int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) {
  return vsetq_lane_s8(a, b, 15);
}

// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_s16(i16 signext %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VSET_LANE]]
int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) {
  return vsetq_lane_s16(a, b, 7);
}

// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_s32(i32 %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
// CHECK:   ret <4 x i32> [[VSET_LANE]]
int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) {
  return vsetq_lane_s32(a, b, 3);
}

// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_p8(i8 signext %a, <16 x i8> %b) #0 {
// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VSET_LANE]]
poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) {
  return vsetq_lane_p8(a, b, 15);
}

// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_p16(i16 signext %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VSET_LANE]]
poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) {
  return vsetq_lane_p16(a, b, 7);
}

// CHECK-LABEL: define <4 x float> @test_vsetq_lane_f32(float %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP1]], float %a, i32 3
// CHECK:   ret <4 x float> [[VSET_LANE]]
float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
  return vsetq_lane_f32(a, b, 3);
}

// CHECK-LABEL: define <8 x half> @test_vsetq_lane_f16(half* %a, <8 x half> %b) #0 {
// CHECK:   [[__REINT_248:%.*]] = alloca half, align 2
// CHECK:   [[__REINT1_248:%.*]] = alloca <8 x half>, align 16
// CHECK:   [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16
// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
// CHECK:   store half [[TMP0]], half* [[__REINT_248]], align 2
// CHECK:   store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast half* [[__REINT_248]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = bitcast <8 x half>* [[__REINT1_248]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[TMP2]], i32 3
// CHECK:   store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16>* [[__REINT2_248]] to <8 x half>*
// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 16
// CHECK:   ret <8 x half> [[TMP8]]
float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {
  return vsetq_lane_f16(*a, b, 3);
}

// The optimizer is able to get rid of all moves now.
// CHECK-LABEL: define <1 x i64> @test_vset_lane_s64(i64 %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
// CHECK:   ret <1 x i64> [[VSET_LANE]]
int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) {
  return vset_lane_s64(a, b, 0);
}

// The optimizer is able to get rid of all moves now.
// CHECK-LABEL: define <1 x i64> @test_vset_lane_u64(i64 %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
// CHECK:   ret <1 x i64> [[VSET_LANE]]
uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) {
  return vset_lane_u64(a, b, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_s64(i64 %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
// CHECK:   ret <2 x i64> [[VSET_LANE]]
int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
  return vsetq_lane_s64(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_u64(i64 %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
// CHECK:   ret <2 x i64> [[VSET_LANE]]
uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) {
  return vsetq_lane_u64(a, b, 1);
}


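// vshl/vshlq: shift by a per-lane signed register amount (negative counts
// shift right), lowered to @llvm.arm.neon.vshifts/vshiftu.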
// CHECK-LABEL: define <8 x i8> @test_vshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VSHL_V_I]]
int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
  return vshl_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
  return vshl_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
  return vshl_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
  return vshl_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VSHL_V_I]]
uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
  return vshl_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
  return vshl_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
  return vshl_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
  return vshl_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VSHLQ_V_I]]
int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
  return vshlq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
  return vshlq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
  return vshlq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
  return vshlq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VSHLQ_V_I]]
uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
  return vshlq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
  return vshlq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
  return vshlq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
  return vshlq_u64(a, b);
}


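// vshll_n: shift left by a constant while widening; codegen is a sext/zext
// followed by a vector shl.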
16986 // CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 {
16987 // CHECK:   [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
16988 // CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
16989 // CHECK:   ret <8 x i16> [[VSHLL_N]]
test_vshll_n_s8(int8x8_t a)16990 int16x8_t test_vshll_n_s8(int8x8_t a) {
16991   return vshll_n_s8(a, 1);
16992 }
16993 
16994 // CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 {
16995 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
16996 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
16997 // CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
16998 // CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
16999 // CHECK:   ret <4 x i32> [[VSHLL_N]]
test_vshll_n_s16(int16x4_t a)17000 int32x4_t test_vshll_n_s16(int16x4_t a) {
17001   return vshll_n_s16(a, 1);
17002 }
17003 
17004 // CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 {
17005 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17006 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17007 // CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
17008 // CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1>
17009 // CHECK:   ret <2 x i64> [[VSHLL_N]]
test_vshll_n_s32(int32x2_t a)17010 int64x2_t test_vshll_n_s32(int32x2_t a) {
17011   return vshll_n_s32(a, 1);
17012 }
17013 
17014 // CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 {
17015 // CHECK:   [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
17016 // CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
17017 // CHECK:   ret <8 x i16> [[VSHLL_N]]
test_vshll_n_u8(uint8x8_t a)17018 uint16x8_t test_vshll_n_u8(uint8x8_t a) {
17019   return vshll_n_u8(a, 1);
17020 }
17021 
17022 // CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 {
17023 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17024 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17025 // CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
17026 // CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
17027 // CHECK:   ret <4 x i32> [[VSHLL_N]]
17028 uint32x4_t test_vshll_n_u16(uint16x4_t a) {
17029   return vshll_n_u16(a, 1);
17030 }
17031 
17032 // CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 {
17033 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17034 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17035 // CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
17036 // CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1>
17037 // CHECK:   ret <2 x i64> [[VSHLL_N]]
17038 uint64x2_t test_vshll_n_u32(uint32x2_t a) {
17039   return vshll_n_u32(a, 1);
17040 }
17041 
17042 
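// Informal summary (not part of the checked output): vshl_n/vshlq_n shift by
// an immediate and lower to a plain IR shl with a splat constant; the
// <8 x i8>/<16 x i8> round-trip bitcasts on the wider element types come
// from the generic builtin lowering and cancel out. Per lane:
//   r[i] = a[i] << n;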
17043 // CHECK-LABEL: define <8 x i8> @test_vshl_n_s8(<8 x i8> %a) #0 {
17044 // CHECK:   [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17045 // CHECK:   ret <8 x i8> [[VSHL_N]]
17046 int8x8_t test_vshl_n_s8(int8x8_t a) {
17047   return vshl_n_s8(a, 1);
17048 }
17049 
17050 // CHECK-LABEL: define <4 x i16> @test_vshl_n_s16(<4 x i16> %a) #0 {
17051 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17052 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17053 // CHECK:   [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
17054 // CHECK:   ret <4 x i16> [[VSHL_N]]
17055 int16x4_t test_vshl_n_s16(int16x4_t a) {
17056   return vshl_n_s16(a, 1);
17057 }
17058 
17059 // CHECK-LABEL: define <2 x i32> @test_vshl_n_s32(<2 x i32> %a) #0 {
17060 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17061 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17062 // CHECK:   [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1>
17063 // CHECK:   ret <2 x i32> [[VSHL_N]]
17064 int32x2_t test_vshl_n_s32(int32x2_t a) {
17065   return vshl_n_s32(a, 1);
17066 }
17067 
17068 // CHECK-LABEL: define <1 x i64> @test_vshl_n_s64(<1 x i64> %a) #0 {
17069 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
17070 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
17071 // CHECK:   [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
17072 // CHECK:   ret <1 x i64> [[VSHL_N]]
17073 int64x1_t test_vshl_n_s64(int64x1_t a) {
17074   return vshl_n_s64(a, 1);
17075 }
17076 
17077 // CHECK-LABEL: define <8 x i8> @test_vshl_n_u8(<8 x i8> %a) #0 {
17078 // CHECK:   [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17079 // CHECK:   ret <8 x i8> [[VSHL_N]]
17080 uint8x8_t test_vshl_n_u8(uint8x8_t a) {
17081   return vshl_n_u8(a, 1);
17082 }
17083 
17084 // CHECK-LABEL: define <4 x i16> @test_vshl_n_u16(<4 x i16> %a) #0 {
17085 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17086 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17087 // CHECK:   [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
17088 // CHECK:   ret <4 x i16> [[VSHL_N]]
17089 uint16x4_t test_vshl_n_u16(uint16x4_t a) {
17090   return vshl_n_u16(a, 1);
17091 }
17092 
17093 // CHECK-LABEL: define <2 x i32> @test_vshl_n_u32(<2 x i32> %a) #0 {
17094 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17095 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17096 // CHECK:   [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1>
17097 // CHECK:   ret <2 x i32> [[VSHL_N]]
17098 uint32x2_t test_vshl_n_u32(uint32x2_t a) {
17099   return vshl_n_u32(a, 1);
17100 }
17101 
17102 // CHECK-LABEL: define <1 x i64> @test_vshl_n_u64(<1 x i64> %a) #0 {
17103 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
17104 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
17105 // CHECK:   [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
17106 // CHECK:   ret <1 x i64> [[VSHL_N]]
17107 uint64x1_t test_vshl_n_u64(uint64x1_t a) {
17108   return vshl_n_u64(a, 1);
17109 }
17110 
17111 // CHECK-LABEL: define <16 x i8> @test_vshlq_n_s8(<16 x i8> %a) #0 {
17112 // CHECK:   [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17113 // CHECK:   ret <16 x i8> [[VSHL_N]]
17114 int8x16_t test_vshlq_n_s8(int8x16_t a) {
17115   return vshlq_n_s8(a, 1);
17116 }
17117 
17118 // CHECK-LABEL: define <8 x i16> @test_vshlq_n_s16(<8 x i16> %a) #0 {
17119 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17120 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17121 // CHECK:   [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
17122 // CHECK:   ret <8 x i16> [[VSHL_N]]
17123 int16x8_t test_vshlq_n_s16(int16x8_t a) {
17124   return vshlq_n_s16(a, 1);
17125 }
17126 
17127 // CHECK-LABEL: define <4 x i32> @test_vshlq_n_s32(<4 x i32> %a) #0 {
17128 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17129 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17130 // CHECK:   [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
17131 // CHECK:   ret <4 x i32> [[VSHL_N]]
17132 int32x4_t test_vshlq_n_s32(int32x4_t a) {
17133   return vshlq_n_s32(a, 1);
17134 }
17135 
17136 // CHECK-LABEL: define <2 x i64> @test_vshlq_n_s64(<2 x i64> %a) #0 {
17137 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17138 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17139 // CHECK:   [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1>
17140 // CHECK:   ret <2 x i64> [[VSHL_N]]
17141 int64x2_t test_vshlq_n_s64(int64x2_t a) {
17142   return vshlq_n_s64(a, 1);
17143 }
17144 
17145 // CHECK-LABEL: define <16 x i8> @test_vshlq_n_u8(<16 x i8> %a) #0 {
17146 // CHECK:   [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17147 // CHECK:   ret <16 x i8> [[VSHL_N]]
17148 uint8x16_t test_vshlq_n_u8(uint8x16_t a) {
17149   return vshlq_n_u8(a, 1);
17150 }
17151 
17152 // CHECK-LABEL: define <8 x i16> @test_vshlq_n_u16(<8 x i16> %a) #0 {
17153 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17154 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17155 // CHECK:   [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
17156 // CHECK:   ret <8 x i16> [[VSHL_N]]
17157 uint16x8_t test_vshlq_n_u16(uint16x8_t a) {
17158   return vshlq_n_u16(a, 1);
17159 }
17160 
17161 // CHECK-LABEL: define <4 x i32> @test_vshlq_n_u32(<4 x i32> %a) #0 {
17162 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17163 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17164 // CHECK:   [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
17165 // CHECK:   ret <4 x i32> [[VSHL_N]]
17166 uint32x4_t test_vshlq_n_u32(uint32x4_t a) {
17167   return vshlq_n_u32(a, 1);
17168 }
17169 
17170 // CHECK-LABEL: define <2 x i64> @test_vshlq_n_u64(<2 x i64> %a) #0 {
17171 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17172 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17173 // CHECK:   [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1>
17174 // CHECK:   ret <2 x i64> [[VSHL_N]]
17175 uint64x2_t test_vshlq_n_u64(uint64x2_t a) {
17176   return vshlq_n_u64(a, 1);
17177 }
17178 
17179 
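// Informal summary (not part of the checked output): vshrn_n_<t> is a
// narrowing shift right: the full-width lanes are shifted (ashr when signed,
// lshr when unsigned) and then truncated to half width. Per lane, roughly:
//   r[i] = (int8_t)(a[i] >> n);   // e.g. vshrn_n_s16(a, 1)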
17180 // CHECK-LABEL: define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) #0 {
17181 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17182 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17183 // CHECK:   [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
17184 // CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
17185 // CHECK:   ret <8 x i8> [[VSHRN_N]]
17186 int8x8_t test_vshrn_n_s16(int16x8_t a) {
17187   return vshrn_n_s16(a, 1);
17188 }
17189 
17190 // CHECK-LABEL: define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) #0 {
17191 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17192 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17193 // CHECK:   [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
17194 // CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
17195 // CHECK:   ret <4 x i16> [[VSHRN_N]]
17196 int16x4_t test_vshrn_n_s32(int32x4_t a) {
17197   return vshrn_n_s32(a, 1);
17198 }
17199 
17200 // CHECK-LABEL: define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) #0 {
17201 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17202 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17203 // CHECK:   [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1>
17204 // CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
17205 // CHECK:   ret <2 x i32> [[VSHRN_N]]
17206 int32x2_t test_vshrn_n_s64(int64x2_t a) {
17207   return vshrn_n_s64(a, 1);
17208 }
17209 
17210 // CHECK-LABEL: define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) #0 {
17211 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17212 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17213 // CHECK:   [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
17214 // CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
17215 // CHECK:   ret <8 x i8> [[VSHRN_N]]
17216 uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
17217   return vshrn_n_u16(a, 1);
17218 }
17219 
17220 // CHECK-LABEL: define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) #0 {
17221 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17222 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17223 // CHECK:   [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
17224 // CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
17225 // CHECK:   ret <4 x i16> [[VSHRN_N]]
17226 uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
17227   return vshrn_n_u32(a, 1);
17228 }
17229 
17230 // CHECK-LABEL: define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) #0 {
17231 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17232 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17233 // CHECK:   [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1>
17234 // CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
17235 // CHECK:   ret <2 x i32> [[VSHRN_N]]
17236 uint32x2_t test_vshrn_n_u64(uint64x2_t a) {
17237   return vshrn_n_u64(a, 1);
17238 }
17239 
17240 
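// Informal summary (not part of the checked output): vshr_n/vshrq_n shift
// right by an immediate at the same lane width; signedness picks the IR
// opcode (ashr for _s*, lshr for _u*). Per lane: r[i] = a[i] >> n;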
17241 // CHECK-LABEL: define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) #0 {
17242 // CHECK:   [[VSHR_N:%.*]] = ashr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17243 // CHECK:   ret <8 x i8> [[VSHR_N]]
17244 int8x8_t test_vshr_n_s8(int8x8_t a) {
17245   return vshr_n_s8(a, 1);
17246 }
17247 
17248 // CHECK-LABEL: define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) #0 {
17249 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17250 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17251 // CHECK:   [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
17252 // CHECK:   ret <4 x i16> [[VSHR_N]]
17253 int16x4_t test_vshr_n_s16(int16x4_t a) {
17254   return vshr_n_s16(a, 1);
17255 }
17256 
17257 // CHECK-LABEL: define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) #0 {
17258 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17259 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17260 // CHECK:   [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], <i32 1, i32 1>
17261 // CHECK:   ret <2 x i32> [[VSHR_N]]
17262 int32x2_t test_vshr_n_s32(int32x2_t a) {
17263   return vshr_n_s32(a, 1);
17264 }
17265 
17266 // CHECK-LABEL: define <1 x i64> @test_vshr_n_s64(<1 x i64> %a) #0 {
17267 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
17268 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
17269 // CHECK:   [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], <i64 1>
17270 // CHECK:   ret <1 x i64> [[VSHR_N]]
17271 int64x1_t test_vshr_n_s64(int64x1_t a) {
17272   return vshr_n_s64(a, 1);
17273 }
17274 
17275 // CHECK-LABEL: define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) #0 {
17276 // CHECK:   [[VSHR_N:%.*]] = lshr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17277 // CHECK:   ret <8 x i8> [[VSHR_N]]
17278 uint8x8_t test_vshr_n_u8(uint8x8_t a) {
17279   return vshr_n_u8(a, 1);
17280 }
17281 
17282 // CHECK-LABEL: define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) #0 {
17283 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17284 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17285 // CHECK:   [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
17286 // CHECK:   ret <4 x i16> [[VSHR_N]]
17287 uint16x4_t test_vshr_n_u16(uint16x4_t a) {
17288   return vshr_n_u16(a, 1);
17289 }
17290 
17291 // CHECK-LABEL: define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) #0 {
17292 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17293 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17294 // CHECK:   [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], <i32 1, i32 1>
17295 // CHECK:   ret <2 x i32> [[VSHR_N]]
17296 uint32x2_t test_vshr_n_u32(uint32x2_t a) {
17297   return vshr_n_u32(a, 1);
17298 }
17299 
17300 // CHECK-LABEL: define <1 x i64> @test_vshr_n_u64(<1 x i64> %a) #0 {
17301 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
17302 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
17303 // CHECK:   [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], <i64 1>
17304 // CHECK:   ret <1 x i64> [[VSHR_N]]
17305 uint64x1_t test_vshr_n_u64(uint64x1_t a) {
17306   return vshr_n_u64(a, 1);
17307 }
17308 
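// The q-register forms below differ only in vector width (128 bits); the
// per-lane arithmetic checked is identical to the d-register forms above.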
17309 // CHECK-LABEL: define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) #0 {
17310 // CHECK:   [[VSHR_N:%.*]] = ashr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17311 // CHECK:   ret <16 x i8> [[VSHR_N]]
17312 int8x16_t test_vshrq_n_s8(int8x16_t a) {
17313   return vshrq_n_s8(a, 1);
17314 }
17315 
17316 // CHECK-LABEL: define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) #0 {
17317 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17318 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17319 // CHECK:   [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
17320 // CHECK:   ret <8 x i16> [[VSHR_N]]
17321 int16x8_t test_vshrq_n_s16(int16x8_t a) {
17322   return vshrq_n_s16(a, 1);
17323 }
17324 
17325 // CHECK-LABEL: define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) #0 {
17326 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17327 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17328 // CHECK:   [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
17329 // CHECK:   ret <4 x i32> [[VSHR_N]]
17330 int32x4_t test_vshrq_n_s32(int32x4_t a) {
17331   return vshrq_n_s32(a, 1);
17332 }
17333 
17334 // CHECK-LABEL: define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) #0 {
17335 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17336 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17337 // CHECK:   [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1>
17338 // CHECK:   ret <2 x i64> [[VSHR_N]]
17339 int64x2_t test_vshrq_n_s64(int64x2_t a) {
17340   return vshrq_n_s64(a, 1);
17341 }
17342 
17343 // CHECK-LABEL: define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) #0 {
17344 // CHECK:   [[VSHR_N:%.*]] = lshr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17345 // CHECK:   ret <16 x i8> [[VSHR_N]]
17346 uint8x16_t test_vshrq_n_u8(uint8x16_t a) {
17347   return vshrq_n_u8(a, 1);
17348 }
17349 
17350 // CHECK-LABEL: define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) #0 {
17351 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17352 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17353 // CHECK:   [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
17354 // CHECK:   ret <8 x i16> [[VSHR_N]]
17355 uint16x8_t test_vshrq_n_u16(uint16x8_t a) {
17356   return vshrq_n_u16(a, 1);
17357 }
17358 
17359 // CHECK-LABEL: define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) #0 {
17360 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17361 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17362 // CHECK:   [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
17363 // CHECK:   ret <4 x i32> [[VSHR_N]]
17364 uint32x4_t test_vshrq_n_u32(uint32x4_t a) {
17365   return vshrq_n_u32(a, 1);
17366 }
17367 
17368 // CHECK-LABEL: define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) #0 {
17369 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17370 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17371 // CHECK:   [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1>
17372 // CHECK:   ret <2 x i64> [[VSHR_N]]
17373 uint64x2_t test_vshrq_n_u64(uint64x2_t a) {
17374   return vshrq_n_u64(a, 1);
17375 }
17376 
17377 
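// Informal summary (not part of the checked output): vsli_n (shift left and
// insert) shifts b left by n and inserts it into a, keeping a's low n bits.
// There is no single IR equivalent, so it stays a call to
// @llvm.arm.neon.vshiftins with a splat of +n. Per lane, roughly:
//   r[i] = (b[i] << n) | (a[i] & ((1 << n) - 1));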
17378 // CHECK-LABEL: define <8 x i8> @test_vsli_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
17379 // CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
17380 // CHECK:   ret <8 x i8> [[VSLI_N]]
17381 int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) {
17382   return vsli_n_s8(a, b, 1);
17383 }
17384 
17385 // CHECK-LABEL: define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
17386 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17387 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
17388 // CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17389 // CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
17390 // CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
17391 // CHECK:   ret <4 x i16> [[VSLI_N2]]
17392 int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) {
17393   return vsli_n_s16(a, b, 1);
17394 }
17395 
17396 // CHECK-LABEL: define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
17397 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17398 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
17399 // CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17400 // CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
17401 // CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 1, i32 1>)
17402 // CHECK:   ret <2 x i32> [[VSLI_N2]]
17403 int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) {
17404   return vsli_n_s32(a, b, 1);
17405 }
17406 
17407 // CHECK-LABEL: define <1 x i64> @test_vsli_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
17408 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
17409 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
17410 // CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
17411 // CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
17412 // CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 1>)
17413 // CHECK:   ret <1 x i64> [[VSLI_N2]]
17414 int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) {
17415   return vsli_n_s64(a, b, 1);
17416 }
17417 
17418 // CHECK-LABEL: define <8 x i8> @test_vsli_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
17419 // CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
17420 // CHECK:   ret <8 x i8> [[VSLI_N]]
17421 uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) {
17422   return vsli_n_u8(a, b, 1);
17423 }
17424 
17425 // CHECK-LABEL: define <4 x i16> @test_vsli_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
17426 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17427 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
17428 // CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17429 // CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
17430 // CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
17431 // CHECK:   ret <4 x i16> [[VSLI_N2]]
17432 uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) {
17433   return vsli_n_u16(a, b, 1);
17434 }
17435 
17436 // CHECK-LABEL: define <2 x i32> @test_vsli_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
17437 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17438 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
17439 // CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17440 // CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
17441 // CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 1, i32 1>)
17442 // CHECK:   ret <2 x i32> [[VSLI_N2]]
17443 uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) {
17444   return vsli_n_u32(a, b, 1);
17445 }
17446 
17447 // CHECK-LABEL: define <1 x i64> @test_vsli_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
17448 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
17449 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
17450 // CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
17451 // CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
17452 // CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 1>)
17453 // CHECK:   ret <1 x i64> [[VSLI_N2]]
17454 uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) {
17455   return vsli_n_u64(a, b, 1);
17456 }
17457 
17458 // CHECK-LABEL: define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) #0 {
17459 // CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
17460 // CHECK:   ret <8 x i8> [[VSLI_N]]
17461 poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) {
17462   return vsli_n_p8(a, b, 1);
17463 }
17464 
17465 // CHECK-LABEL: define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) #0 {
17466 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17467 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
17468 // CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17469 // CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
17470 // CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
17471 // CHECK:   ret <4 x i16> [[VSLI_N2]]
17472 poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) {
17473   return vsli_n_p16(a, b, 1);
17474 }
17475 
17476 // CHECK-LABEL: define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
17477 // CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
17478 // CHECK:   ret <16 x i8> [[VSLI_N]]
17479 int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) {
17480   return vsliq_n_s8(a, b, 1);
17481 }
17482 
17483 // CHECK-LABEL: define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
17484 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17485 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
17486 // CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17487 // CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
17488 // CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
17489 // CHECK:   ret <8 x i16> [[VSLI_N2]]
17490 int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) {
17491   return vsliq_n_s16(a, b, 1);
17492 }
17493 
17494 // CHECK-LABEL: define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
17495 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17496 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
17497 // CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17498 // CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
17499 // CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
17500 // CHECK:   ret <4 x i32> [[VSLI_N2]]
17501 int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) {
17502   return vsliq_n_s32(a, b, 1);
17503 }
17504 
17505 // CHECK-LABEL: define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
17506 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17507 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
17508 // CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17509 // CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
17510 // CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 1, i64 1>)
17511 // CHECK:   ret <2 x i64> [[VSLI_N2]]
17512 int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) {
17513   return vsliq_n_s64(a, b, 1);
17514 }
17515 
17516 // CHECK-LABEL: define <16 x i8> @test_vsliq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
17517 // CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
17518 // CHECK:   ret <16 x i8> [[VSLI_N]]
17519 uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) {
17520   return vsliq_n_u8(a, b, 1);
17521 }
17522 
17523 // CHECK-LABEL: define <8 x i16> @test_vsliq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
17524 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17525 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
17526 // CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17527 // CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
17528 // CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
17529 // CHECK:   ret <8 x i16> [[VSLI_N2]]
17530 uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) {
17531   return vsliq_n_u16(a, b, 1);
17532 }
17533 
17534 // CHECK-LABEL: define <4 x i32> @test_vsliq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
17535 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17536 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
17537 // CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17538 // CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
17539 // CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
17540 // CHECK:   ret <4 x i32> [[VSLI_N2]]
17541 uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) {
17542   return vsliq_n_u32(a, b, 1);
17543 }
17544 
17545 // CHECK-LABEL: define <2 x i64> @test_vsliq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
17546 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17547 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
17548 // CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17549 // CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
17550 // CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 1, i64 1>)
17551 // CHECK:   ret <2 x i64> [[VSLI_N2]]
17552 uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) {
17553   return vsliq_n_u64(a, b, 1);
17554 }
17555 
17556 // CHECK-LABEL: define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 {
17557 // CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
17558 // CHECK:   ret <16 x i8> [[VSLI_N]]
17559 poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) {
17560   return vsliq_n_p8(a, b, 1);
17561 }
17562 
17563 // CHECK-LABEL: define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 {
17564 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17565 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
17566 // CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17567 // CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
17568 // CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
17569 // CHECK:   ret <8 x i16> [[VSLI_N2]]
17570 poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) {
17571   return vsliq_n_p16(a, b, 1);
17572 }
17573 
17574 
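// Informal summary (not part of the checked output): vsra_n (shift right and
// accumulate) lowers to ordinary IR: shift b right by the immediate
// (ashr/lshr by signedness) and add the result to a. Per lane:
//   r[i] = a[i] + (b[i] >> n);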
17575 // CHECK-LABEL: define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
17576 // CHECK:   [[VSRA_N:%.*]] = ashr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17577 // CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
17578 // CHECK:   ret <8 x i8> [[TMP0]]
17579 int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) {
17580   return vsra_n_s8(a, b, 1);
17581 }
17582 
17583 // CHECK-LABEL: define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
17584 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17585 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
17586 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17587 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
17588 // CHECK:   [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1>
17589 // CHECK:   [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
17590 // CHECK:   ret <4 x i16> [[TMP4]]
17591 int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) {
17592   return vsra_n_s16(a, b, 1);
17593 }
17594 
17595 // CHECK-LABEL: define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
17596 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17597 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
17598 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17599 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
17600 // CHECK:   [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], <i32 1, i32 1>
17601 // CHECK:   [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
17602 // CHECK:   ret <2 x i32> [[TMP4]]
17603 int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) {
17604   return vsra_n_s32(a, b, 1);
17605 }
17606 
17607 // CHECK-LABEL: define <1 x i64> @test_vsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
17608 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
17609 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
17610 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
17611 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
17612 // CHECK:   [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], <i64 1>
17613 // CHECK:   [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
17614 // CHECK:   ret <1 x i64> [[TMP4]]
17615 int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
17616   return vsra_n_s64(a, b, 1);
17617 }
17618 
17619 // CHECK-LABEL: define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
17620 // CHECK:   [[VSRA_N:%.*]] = lshr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17621 // CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
17622 // CHECK:   ret <8 x i8> [[TMP0]]
17623 uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) {
17624   return vsra_n_u8(a, b, 1);
17625 }
17626 
17627 // CHECK-LABEL: define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
17628 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17629 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
17630 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17631 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
17632 // CHECK:   [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1>
17633 // CHECK:   [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
17634 // CHECK:   ret <4 x i16> [[TMP4]]
17635 uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) {
17636   return vsra_n_u16(a, b, 1);
17637 }
17638 
17639 // CHECK-LABEL: define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
17640 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17641 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
17642 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17643 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
17644 // CHECK:   [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], <i32 1, i32 1>
17645 // CHECK:   [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
17646 // CHECK:   ret <2 x i32> [[TMP4]]
17647 uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) {
17648   return vsra_n_u32(a, b, 1);
17649 }
17650 
17651 // CHECK-LABEL: define <1 x i64> @test_vsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
17652 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
17653 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
17654 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
17655 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
17656 // CHECK:   [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], <i64 1>
17657 // CHECK:   [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
17658 // CHECK:   ret <1 x i64> [[TMP4]]
17659 uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
17660   return vsra_n_u64(a, b, 1);
17661 }
17662 
17663 // CHECK-LABEL: define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
17664 // CHECK:   [[VSRA_N:%.*]] = ashr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17665 // CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
17666 // CHECK:   ret <16 x i8> [[TMP0]]
17667 int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) {
17668   return vsraq_n_s8(a, b, 1);
17669 }
17670 
17671 // CHECK-LABEL: define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
17672 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17673 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
17674 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17675 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
17676 // CHECK:   [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
17677 // CHECK:   [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
17678 // CHECK:   ret <8 x i16> [[TMP4]]
17679 int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) {
17680   return vsraq_n_s16(a, b, 1);
17681 }
17682 
17683 // CHECK-LABEL: define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
17684 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17685 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
17686 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17687 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
17688 // CHECK:   [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1>
17689 // CHECK:   [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
17690 // CHECK:   ret <4 x i32> [[TMP4]]
17691 int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) {
17692   return vsraq_n_s32(a, b, 1);
17693 }
17694 
17695 // CHECK-LABEL: define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
17696 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17697 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
17698 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17699 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
17700 // CHECK:   [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], <i64 1, i64 1>
17701 // CHECK:   [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
17702 // CHECK:   ret <2 x i64> [[TMP4]]
17703 int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) {
17704   return vsraq_n_s64(a, b, 1);
17705 }
17706 
17707 // CHECK-LABEL: define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
17708 // CHECK:   [[VSRA_N:%.*]] = lshr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17709 // CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
17710 // CHECK:   ret <16 x i8> [[TMP0]]
17711 uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) {
17712   return vsraq_n_u8(a, b, 1);
17713 }
17714 
17715 // CHECK-LABEL: define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
17716 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17717 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
17718 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17719 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
17720 // CHECK:   [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
17721 // CHECK:   [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
17722 // CHECK:   ret <8 x i16> [[TMP4]]
17723 uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) {
17724   return vsraq_n_u16(a, b, 1);
17725 }
17726 
17727 // CHECK-LABEL: define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
17728 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17729 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
17730 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17731 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
17732 // CHECK:   [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1>
17733 // CHECK:   [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
17734 // CHECK:   ret <4 x i32> [[TMP4]]
17735 uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) {
17736   return vsraq_n_u32(a, b, 1);
17737 }
17738 
17739 // CHECK-LABEL: define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
17740 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17741 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
17742 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17743 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
17744 // CHECK:   [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], <i64 1, i64 1>
17745 // CHECK:   [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
17746 // CHECK:   ret <2 x i64> [[TMP4]]
17747 uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) {
17748   return vsraq_n_u64(a, b, 1);
17749 }
17750 
17751 
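// Informal summary (not part of the checked output): vsri_n (shift right and
// insert) reuses the same @llvm.arm.neon.vshiftins intrinsic as vsli_n, with
// the right shift encoded as a splat of -n (hence the -1 constants and the
// VSLI_N value names below). Per lane, roughly:
//   r[i] = (b[i] >> n) | (a[i] & ~(~0u >> n));   // keeps a's top n bits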
17752 // CHECK-LABEL: define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
17753 // CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
17754 // CHECK:   ret <8 x i8> [[VSLI_N]]
17755 int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) {
17756   return vsri_n_s8(a, b, 1);
17757 }
17758 
17759 // CHECK-LABEL: define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
17760 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17761 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
17762 // CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17763 // CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
17764 // CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
17765 // CHECK:   ret <4 x i16> [[VSLI_N2]]
17766 int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) {
17767   return vsri_n_s16(a, b, 1);
17768 }
17769 
17770 // CHECK-LABEL: define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
17771 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17772 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
17773 // CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17774 // CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
17775 // CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 -1, i32 -1>)
17776 // CHECK:   ret <2 x i32> [[VSLI_N2]]
17777 int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) {
17778   return vsri_n_s32(a, b, 1);
17779 }
17780 
17781 // CHECK-LABEL: define <1 x i64> @test_vsri_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
17782 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
17783 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
17784 // CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
17785 // CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
17786 // CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 -1>)
17787 // CHECK:   ret <1 x i64> [[VSLI_N2]]
17788 int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) {
17789   return vsri_n_s64(a, b, 1);
17790 }
17791 
17792 // CHECK-LABEL: define <8 x i8> @test_vsri_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
17793 // CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
17794 // CHECK:   ret <8 x i8> [[VSLI_N]]
17795 uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) {
17796   return vsri_n_u8(a, b, 1);
17797 }
17798 
17799 // CHECK-LABEL: define <4 x i16> @test_vsri_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
17800 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17801 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
17802 // CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17803 // CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
17804 // CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
17805 // CHECK:   ret <4 x i16> [[VSLI_N2]]
17806 uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) {
17807   return vsri_n_u16(a, b, 1);
17808 }
17809 
17810 // CHECK-LABEL: define <2 x i32> @test_vsri_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
17811 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17812 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
17813 // CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17814 // CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
17815 // CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 -1, i32 -1>)
17816 // CHECK:   ret <2 x i32> [[VSLI_N2]]
17817 uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) {
17818   return vsri_n_u32(a, b, 1);
17819 }
17820 
17821 // CHECK-LABEL: define <1 x i64> @test_vsri_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
17822 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
17823 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
17824 // CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
17825 // CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
17826 // CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 -1>)
17827 // CHECK:   ret <1 x i64> [[VSLI_N2]]
17828 uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) {
17829   return vsri_n_u64(a, b, 1);
17830 }
17831 
17832 // CHECK-LABEL: define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) #0 {
17833 // CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
17834 // CHECK:   ret <8 x i8> [[VSLI_N]]
17835 poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) {
17836   return vsri_n_p8(a, b, 1);
17837 }
17838 
17839 // CHECK-LABEL: define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) #0 {
17840 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17841 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
17842 // CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17843 // CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
17844 // CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
17845 // CHECK:   ret <4 x i16> [[VSLI_N2]]
17846 poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) {
17847   return vsri_n_p16(a, b, 1);
17848 }
17849 
17850 // CHECK-LABEL: define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
17851 // CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
17852 // CHECK:   ret <16 x i8> [[VSLI_N]]
17853 int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) {
17854   return vsriq_n_s8(a, b, 1);
17855 }
17856 
17857 // CHECK-LABEL: define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
17858 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17859 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
17860 // CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17861 // CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
17862 // CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
17863 // CHECK:   ret <8 x i16> [[VSLI_N2]]
17864 int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) {
17865   return vsriq_n_s16(a, b, 1);
17866 }
17867 
17868 // CHECK-LABEL: define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
17869 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17870 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
17871 // CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17872 // CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
17873 // CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
17874 // CHECK:   ret <4 x i32> [[VSLI_N2]]
17875 int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) {
17876   return vsriq_n_s32(a, b, 1);
17877 }
17878 
17879 // CHECK-LABEL: define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
17880 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17881 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
17882 // CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17883 // CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
17884 // CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 -1, i64 -1>)
17885 // CHECK:   ret <2 x i64> [[VSLI_N2]]
17886 int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) {
17887   return vsriq_n_s64(a, b, 1);
17888 }
17889 
17890 // CHECK-LABEL: define <16 x i8> @test_vsriq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
17891 // CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
17892 // CHECK:   ret <16 x i8> [[VSLI_N]]
17893 uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) {
17894   return vsriq_n_u8(a, b, 1);
17895 }
17896 
17897 // CHECK-LABEL: define <8 x i16> @test_vsriq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
17898 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17899 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
17900 // CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17901 // CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
17902 // CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
17903 // CHECK:   ret <8 x i16> [[VSLI_N2]]
17904 uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) {
17905   return vsriq_n_u16(a, b, 1);
17906 }
17907 
17908 // CHECK-LABEL: define <4 x i32> @test_vsriq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
17909 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17910 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
17911 // CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17912 // CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
17913 // CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
17914 // CHECK:   ret <4 x i32> [[VSLI_N2]]
17915 uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) {
17916   return vsriq_n_u32(a, b, 1);
17917 }
17918 
17919 // CHECK-LABEL: define <2 x i64> @test_vsriq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
17920 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17921 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
17922 // CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17923 // CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
17924 // CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 -1, i64 -1>)
17925 // CHECK:   ret <2 x i64> [[VSLI_N2]]
17926 uint64x2_t test_vsriq_n_u64(uint64x2_t a, uint64x2_t b) {
17927   return vsriq_n_u64(a, b, 1);
17928 }
17929 
17930 // CHECK-LABEL: define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 {
17931 // CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
17932 // CHECK:   ret <16 x i8> [[VSLI_N]]
17933 poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) {
17934   return vsriq_n_p8(a, b, 1);
17935 }
17936 
17937 // CHECK-LABEL: define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 {
17938 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17939 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
17940 // CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17941 // CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
17942 // CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
17943 // CHECK:   ret <8 x i16> [[VSLI_N2]]
17944 poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) {
17945   return vsriq_n_p16(a, b, 1);
17946 }
17947 
17948 
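// Informal summary (not part of the checked output): vst1q_<t> stores a full
// q register through an i8* pointer via @llvm.arm.neon.vst1; the trailing
// i32 argument is the alignment clang assumes for the element type (1, 2, or
// 4 in the checks below).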
17949 // CHECK-LABEL: define void @test_vst1q_u8(i8* %a, <16 x i8> %b) #0 {
17950 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
17951 // CHECK:   ret void
17952 void test_vst1q_u8(uint8_t * a, uint8x16_t b) {
17953   vst1q_u8(a, b);
17954 }
17955 
17956 // CHECK-LABEL: define void @test_vst1q_u16(i16* %a, <8 x i16> %b) #0 {
17957 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
17958 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
17959 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
17960 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
17961 // CHECK:   ret void
17962 void test_vst1q_u16(uint16_t * a, uint16x8_t b) {
17963   vst1q_u16(a, b);
17964 }
17965 
17966 // CHECK-LABEL: define void @test_vst1q_u32(i32* %a, <4 x i32> %b) #0 {
17967 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
17968 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
17969 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
17970 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* [[TMP0]], <4 x i32> [[TMP2]], i32 4)
17971 // CHECK:   ret void
17972 void test_vst1q_u32(uint32_t * a, uint32x4_t b) {
17973   vst1q_u32(a, b);
17974 }
17975 
17976 // CHECK-LABEL: define void @test_vst1q_u64(i64* %a, <2 x i64> %b) #0 {
17977 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
17978 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
17979 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
17980 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* [[TMP0]], <2 x i64> [[TMP2]], i32 4)
17981 // CHECK:   ret void
test_vst1q_u64(uint64_t * a,uint64x2_t b)17982 void test_vst1q_u64(uint64_t * a, uint64x2_t b) {
17983   vst1q_u64(a, b);
17984 }
17985 
17986 // CHECK-LABEL: define void @test_vst1q_s8(i8* %a, <16 x i8> %b) #0 {
17987 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
17988 // CHECK:   ret void
test_vst1q_s8(int8_t * a,int8x16_t b)17989 void test_vst1q_s8(int8_t * a, int8x16_t b) {
17990   vst1q_s8(a, b);
17991 }
17992 
17993 // CHECK-LABEL: define void @test_vst1q_s16(i16* %a, <8 x i16> %b) #0 {
17994 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
17995 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
17996 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
17997 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
17998 // CHECK:   ret void
test_vst1q_s16(int16_t * a,int16x8_t b)17999 void test_vst1q_s16(int16_t * a, int16x8_t b) {
18000   vst1q_s16(a, b);
18001 }
18002 
18003 // CHECK-LABEL: define void @test_vst1q_s32(i32* %a, <4 x i32> %b) #0 {
18004 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
18005 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
18006 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
18007 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* [[TMP0]], <4 x i32> [[TMP2]], i32 4)
18008 // CHECK:   ret void
test_vst1q_s32(int32_t * a,int32x4_t b)18009 void test_vst1q_s32(int32_t * a, int32x4_t b) {
18010   vst1q_s32(a, b);
18011 }
18012 
18013 // CHECK-LABEL: define void @test_vst1q_s64(i64* %a, <2 x i64> %b) #0 {
18014 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
18015 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
18016 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
18017 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* [[TMP0]], <2 x i64> [[TMP2]], i32 4)
18018 // CHECK:   ret void
test_vst1q_s64(int64_t * a,int64x2_t b)18019 void test_vst1q_s64(int64_t * a, int64x2_t b) {
18020   vst1q_s64(a, b);
18021 }
18022 
18023 // CHECK-LABEL: define void @test_vst1q_f16(half* %a, <8 x half> %b) #0 {
18024 // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
18025 // CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
18026 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
18027 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
18028 // CHECK:   ret void
test_vst1q_f16(float16_t * a,float16x8_t b)18029 void test_vst1q_f16(float16_t * a, float16x8_t b) {
18030   vst1q_f16(a, b);
18031 }
18032 
18033 // CHECK-LABEL: define void @test_vst1q_f32(float* %a, <4 x float> %b) #0 {
18034 // CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
18035 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
18036 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
18037 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* [[TMP0]], <4 x float> [[TMP2]], i32 4)
18038 // CHECK:   ret void
test_vst1q_f32(float32_t * a,float32x4_t b)18039 void test_vst1q_f32(float32_t * a, float32x4_t b) {
18040   vst1q_f32(a, b);
18041 }
18042 
18043 // CHECK-LABEL: define void @test_vst1q_p8(i8* %a, <16 x i8> %b) #0 {
18044 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
18045 // CHECK:   ret void
test_vst1q_p8(poly8_t * a,poly8x16_t b)18046 void test_vst1q_p8(poly8_t * a, poly8x16_t b) {
18047   vst1q_p8(a, b);
18048 }
18049 
18050 // CHECK-LABEL: define void @test_vst1q_p16(i16* %a, <8 x i16> %b) #0 {
18051 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
18052 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
18053 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
18054 // CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
18055 // CHECK:   ret void
test_vst1q_p16(poly16_t * a,poly16x8_t b)18056 void test_vst1q_p16(poly16_t * a, poly16x8_t b) {
18057   vst1q_p16(a, b);
18058 }
18059 
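// The non-q vst1 forms below are the 64-bit (d-register) counterparts of the
// tests above; the lowering is the same except the vectors are half as wide.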
// CHECK-LABEL: define void @test_vst1_u8(i8* %a, <8 x i8> %b) #0 {
// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
// CHECK:   ret void
void test_vst1_u8(uint8_t * a, uint8x8_t b) {
  vst1_u8(a, b);
}

// CHECK-LABEL: define void @test_vst1_u16(i16* %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
// CHECK:   ret void
void test_vst1_u16(uint16_t * a, uint16x4_t b) {
  vst1_u16(a, b);
}

// CHECK-LABEL: define void @test_vst1_u32(i32* %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* [[TMP0]], <2 x i32> [[TMP2]], i32 4)
// CHECK:   ret void
void test_vst1_u32(uint32_t * a, uint32x2_t b) {
  vst1_u32(a, b);
}

// CHECK-LABEL: define void @test_vst1_u64(i64* %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP2]], i32 4)
// CHECK:   ret void
void test_vst1_u64(uint64_t * a, uint64x1_t b) {
  vst1_u64(a, b);
}

// CHECK-LABEL: define void @test_vst1_s8(i8* %a, <8 x i8> %b) #0 {
// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
// CHECK:   ret void
void test_vst1_s8(int8_t * a, int8x8_t b) {
  vst1_s8(a, b);
}

// CHECK-LABEL: define void @test_vst1_s16(i16* %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
// CHECK:   ret void
void test_vst1_s16(int16_t * a, int16x4_t b) {
  vst1_s16(a, b);
}

// CHECK-LABEL: define void @test_vst1_s32(i32* %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* [[TMP0]], <2 x i32> [[TMP2]], i32 4)
// CHECK:   ret void
void test_vst1_s32(int32_t * a, int32x2_t b) {
  vst1_s32(a, b);
}

// CHECK-LABEL: define void @test_vst1_s64(i64* %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP2]], i32 4)
// CHECK:   ret void
void test_vst1_s64(int64_t * a, int64x1_t b) {
  vst1_s64(a, b);
}

// CHECK-LABEL: define void @test_vst1_f16(half* %a, <4 x half> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
// CHECK:   ret void
void test_vst1_f16(float16_t * a, float16x4_t b) {
  vst1_f16(a, b);
}

// CHECK-LABEL: define void @test_vst1_f32(float* %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* [[TMP0]], <2 x float> [[TMP2]], i32 4)
// CHECK:   ret void
void test_vst1_f32(float32_t * a, float32x2_t b) {
  vst1_f32(a, b);
}

// CHECK-LABEL: define void @test_vst1_p8(i8* %a, <8 x i8> %b) #0 {
// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
// CHECK:   ret void
void test_vst1_p8(poly8_t * a, poly8x8_t b) {
  vst1_p8(a, b);
}

// CHECK-LABEL: define void @test_vst1_p16(i16* %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
// CHECK:   ret void
void test_vst1_p16(poly16_t * a, poly16x4_t b) {
  vst1_p16(a, b);
}

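// Lane stores write a single selected element. For 8/16/32-bit lanes this is
// just extractelement plus a scalar store, e.g. vst1q_lane_u16(a, b, 7) is
// roughly *a = vgetq_lane_u16(b, 7). 64-bit lanes are instead shuffled into
// a <1 x i64> and stored through the vst1 intrinsic.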
// CHECK-LABEL: define void @test_vst1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
// CHECK:   store i8 [[TMP0]], i8* %a, align 1
// CHECK:   ret void
void test_vst1q_lane_u8(uint8_t * a, uint8x16_t b) {
  vst1q_lane_u8(a, b, 15);
}

// CHECK-LABEL: define void @test_vst1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
// CHECK:   ret void
void test_vst1q_lane_u16(uint16_t * a, uint16x8_t b) {
  vst1q_lane_u16(a, b, 7);
}

// CHECK-LABEL: define void @test_vst1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
// CHECK:   ret void
void test_vst1q_lane_u32(uint32_t * a, uint32x4_t b) {
  vst1q_lane_u32(a, b, 3);
}

// CHECK-LABEL: define void @test_vst1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP3]], i32 4)
// CHECK:   ret void
void test_vst1q_lane_u64(uint64_t * a, uint64x2_t b) {
  vst1q_lane_u64(a, b, 1);
}

// CHECK-LABEL: define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
// CHECK:   store i8 [[TMP0]], i8* %a, align 1
// CHECK:   ret void
void test_vst1q_lane_s8(int8_t * a, int8x16_t b) {
  vst1q_lane_s8(a, b, 15);
}

// CHECK-LABEL: define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
// CHECK:   ret void
void test_vst1q_lane_s16(int16_t * a, int16x8_t b) {
  vst1q_lane_s16(a, b, 7);
}

// CHECK-LABEL: define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
// CHECK:   ret void
void test_vst1q_lane_s32(int32_t * a, int32x4_t b) {
  vst1q_lane_s32(a, b, 3);
}

// CHECK-LABEL: define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP3]], i32 4)
// CHECK:   ret void
void test_vst1q_lane_s64(int64_t * a, int64x2_t b) {
  vst1q_lane_s64(a, b, 1);
}

// CHECK-LABEL: define void @test_vst1q_lane_f16(half* %a, <8 x half> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
// CHECK:   ret void
void test_vst1q_lane_f16(float16_t * a, float16x8_t b) {
  vst1q_lane_f16(a, b, 7);
}

// CHECK-LABEL: define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
// CHECK:   store float [[TMP3]], float* [[TMP4]], align 4
// CHECK:   ret void
void test_vst1q_lane_f32(float32_t * a, float32x4_t b) {
  vst1q_lane_f32(a, b, 3);
}

// CHECK-LABEL: define void @test_vst1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
// CHECK:   store i8 [[TMP0]], i8* %a, align 1
// CHECK:   ret void
void test_vst1q_lane_p8(poly8_t * a, poly8x16_t b) {
  vst1q_lane_p8(a, b, 15);
}

// CHECK-LABEL: define void @test_vst1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
// CHECK:   ret void
void test_vst1q_lane_p16(poly16_t * a, poly16x8_t b) {
  vst1q_lane_p16(a, b, 7);
}

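// d-register lane stores: the same pattern with narrower source vectors and
// correspondingly smaller maximum lane indices.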
// CHECK-LABEL: define void @test_vst1_lane_u8(i8* %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
// CHECK:   store i8 [[TMP0]], i8* %a, align 1
// CHECK:   ret void
void test_vst1_lane_u8(uint8_t * a, uint8x8_t b) {
  vst1_lane_u8(a, b, 7);
}

// CHECK-LABEL: define void @test_vst1_lane_u16(i16* %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
// CHECK:   ret void
void test_vst1_lane_u16(uint16_t * a, uint16x4_t b) {
  vst1_lane_u16(a, b, 3);
}

// CHECK-LABEL: define void @test_vst1_lane_u32(i32* %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
// CHECK:   ret void
void test_vst1_lane_u32(uint32_t * a, uint32x2_t b) {
  vst1_lane_u32(a, b, 1);
}

// CHECK-LABEL: define void @test_vst1_lane_u64(i64* %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
// CHECK:   store i64 [[TMP3]], i64* [[TMP4]], align 4
// CHECK:   ret void
void test_vst1_lane_u64(uint64_t * a, uint64x1_t b) {
  vst1_lane_u64(a, b, 0);
}

// CHECK-LABEL: define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
// CHECK:   store i8 [[TMP0]], i8* %a, align 1
// CHECK:   ret void
void test_vst1_lane_s8(int8_t * a, int8x8_t b) {
  vst1_lane_s8(a, b, 7);
}

// CHECK-LABEL: define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
// CHECK:   ret void
void test_vst1_lane_s16(int16_t * a, int16x4_t b) {
  vst1_lane_s16(a, b, 3);
}

// CHECK-LABEL: define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
// CHECK:   ret void
void test_vst1_lane_s32(int32_t * a, int32x2_t b) {
  vst1_lane_s32(a, b, 1);
}

// CHECK-LABEL: define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
// CHECK:   store i64 [[TMP3]], i64* [[TMP4]], align 4
// CHECK:   ret void
void test_vst1_lane_s64(int64_t * a, int64x1_t b) {
  vst1_lane_s64(a, b, 0);
}

// CHECK-LABEL: define void @test_vst1_lane_f16(half* %a, <4 x half> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
// CHECK:   ret void
void test_vst1_lane_f16(float16_t * a, float16x4_t b) {
  vst1_lane_f16(a, b, 3);
}

// CHECK-LABEL: define void @test_vst1_lane_f32(float* %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
// CHECK:   store float [[TMP3]], float* [[TMP4]], align 4
// CHECK:   ret void
void test_vst1_lane_f32(float32_t * a, float32x2_t b) {
  vst1_lane_f32(a, b, 1);
}

// CHECK-LABEL: define void @test_vst1_lane_p8(i8* %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
// CHECK:   store i8 [[TMP0]], i8* %a, align 1
// CHECK:   ret void
void test_vst1_lane_p8(poly8_t * a, poly8x8_t b) {
  vst1_lane_p8(a, b, 7);
}

// CHECK-LABEL: define void @test_vst1_lane_p16(i16* %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
// CHECK:   ret void
void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) {
  vst1_lane_p16(a, b, 3);
}

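// vst2/vst2q store a pair of vectors with 2-way interleaving, so for
// vst2q_u8(a, b) the memory layout is b.val[0][0], b.val[1][0], b.val[0][1],
// b.val[1][1], ... The pair is passed as a struct coerced to an i64 array,
// spilled to a local alloca, copied with llvm.memcpy, and its two elements
// are reloaded before the call to @llvm.arm.neon.vst2.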
// CHECK-LABEL: define void @test_vst2q_u8(i8* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
// CHECK:   ret void
void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {
  vst2q_u8(a, b);
}

// CHECK-LABEL: define void @test_vst2q_u16(i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
// CHECK:   ret void
void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) {
  vst2q_u16(a, b);
}

// CHECK-LABEL: define void @test_vst2q_u32(i32* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
// CHECK:   ret void
void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) {
  vst2q_u32(a, b);
}

// CHECK-LABEL: define void @test_vst2q_s8(i8* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
// CHECK:   ret void
void test_vst2q_s8(int8_t * a, int8x16x2_t b) {
  vst2q_s8(a, b);
}

// CHECK-LABEL: define void @test_vst2q_s16(i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
// CHECK:   ret void
void test_vst2q_s16(int16_t * a, int16x8x2_t b) {
  vst2q_s16(a, b);
}

// CHECK-LABEL: define void @test_vst2q_s32(i32* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
// CHECK:   ret void
void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
  vst2q_s32(a, b);
}

// CHECK-LABEL: define void @test_vst2q_f16(half* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
// CHECK:   ret void
void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
  vst2q_f16(a, b);
}

// CHECK-LABEL: define void @test_vst2q_f32(float* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 4)
// CHECK:   ret void
void test_vst2q_f32(float32_t * a, float32x4x2_t b) {
  vst2q_f32(a, b);
}

// CHECK-LABEL: define void @test_vst2q_p8(i8* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
// CHECK:   ret void
void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) {
  vst2q_p8(a, b);
}

// CHECK-LABEL: define void @test_vst2q_p16(i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
// CHECK:   ret void
void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) {
  vst2q_p16(a, b);
}

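// 64-bit vst2 variants: the two d-registers arrive coerced to [2 x i64]
// rather than [4 x i64] and use 8-byte alignment, but are otherwise lowered
// the same way.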
18680 // CHECK-LABEL: define void @test_vst2_u8(i8* %a, [2 x i64] %b.coerce) #0 {
18681 // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
18682 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
18683 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
18684 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
18685 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18686 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
18687 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
18688 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18689 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
18690 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
18691 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
18692 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
18693 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
18694 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
18695 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
18696 // CHECK:   ret void
test_vst2_u8(uint8_t * a,uint8x8x2_t b)18697 void test_vst2_u8(uint8_t * a, uint8x8x2_t b) {
18698   vst2_u8(a, b);
18699 }
18700 
18701 // CHECK-LABEL: define void @test_vst2_u16(i16* %a, [2 x i64] %b.coerce) #0 {
18702 // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
18703 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
18704 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
18705 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
18706 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18707 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
18708 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
18709 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18710 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
18711 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
18712 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
18713 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
18714 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
18715 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
18716 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
18717 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
18718 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
18719 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
18720 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
18721 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
18722 // CHECK:   ret void
test_vst2_u16(uint16_t * a,uint16x4x2_t b)18723 void test_vst2_u16(uint16_t * a, uint16x4x2_t b) {
18724   vst2_u16(a, b);
18725 }
18726 
18727 // CHECK-LABEL: define void @test_vst2_u32(i32* %a, [2 x i64] %b.coerce) #0 {
18728 // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
18729 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
18730 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
18731 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
18732 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18733 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
18734 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
18735 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18736 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
18737 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
18738 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
18739 // CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
18740 // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
18741 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
18742 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
18743 // CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
18744 // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
18745 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
18746 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
18747 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
18748 // CHECK:   ret void
test_vst2_u32(uint32_t * a,uint32x2x2_t b)18749 void test_vst2_u32(uint32_t * a, uint32x2x2_t b) {
18750   vst2_u32(a, b);
18751 }
18752 
18753 // CHECK-LABEL: define void @test_vst2_u64(i64* %a, [2 x i64] %b.coerce) #0 {
18754 // CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
18755 // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
18756 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
18757 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <1 x i64>]* [[COERCE_DIVE]] to [2 x i64]*
18758 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18759 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
18760 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
18761 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18762 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
18763 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
18764 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
18765 // CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
18766 // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
18767 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
18768 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i32 0, i32 1
18769 // CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
18770 // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
18771 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
18772 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
18773 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
18774 // CHECK:   ret void
test_vst2_u64(uint64_t * a,uint64x1x2_t b)18775 void test_vst2_u64(uint64_t * a, uint64x1x2_t b) {
18776   vst2_u64(a, b);
18777 }
18778 
18779 // CHECK-LABEL: define void @test_vst2_s8(i8* %a, [2 x i64] %b.coerce) #0 {
18780 // CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
18781 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
18782 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
18783 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
18784 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18785 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
18786 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
18787 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18788 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
18789 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
18790 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
18791 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
18792 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
18793 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
18794 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
18795 // CHECK:   ret void
test_vst2_s8(int8_t * a,int8x8x2_t b)18796 void test_vst2_s8(int8_t * a, int8x8x2_t b) {
18797   vst2_s8(a, b);
18798 }
18799 
18800 // CHECK-LABEL: define void @test_vst2_s16(i16* %a, [2 x i64] %b.coerce) #0 {
18801 // CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
18802 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
18803 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
18804 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
18805 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18806 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
18807 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
18808 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18809 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
18810 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
18811 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
18812 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
18813 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
18814 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
18815 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
18816 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
18817 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
18818 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
18819 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
18820 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
18821 // CHECK:   ret void
test_vst2_s16(int16_t * a,int16x4x2_t b)18822 void test_vst2_s16(int16_t * a, int16x4x2_t b) {
18823   vst2_s16(a, b);
18824 }
18825 
18826 // CHECK-LABEL: define void @test_vst2_s32(i32* %a, [2 x i64] %b.coerce) #0 {
18827 // CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
18828 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
18829 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
18830 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
18831 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18832 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
18833 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
18834 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18835 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
18836 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
18837 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
18838 // CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
18839 // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
18840 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
18841 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
18842 // CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
18843 // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
18844 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
18845 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
18846 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
18847 // CHECK:   ret void
18848 void test_vst2_s32(int32_t * a, int32x2x2_t b) {
18849   vst2_s32(a, b);
18850 }
18851 
18852 // CHECK-LABEL: define void @test_vst2_s64(i64* %a, [2 x i64] %b.coerce) #0 {
18853 // CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
18854 // CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
18855 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
18856 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <1 x i64>]* [[COERCE_DIVE]] to [2 x i64]*
18857 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18858 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
18859 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
18860 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18861 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
18862 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
18863 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
18864 // CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
18865 // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
18866 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
18867 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i32 0, i32 1
18868 // CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
18869 // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
18870 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
18871 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
18872 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
18873 // CHECK:   ret void
18874 void test_vst2_s64(int64_t * a, int64x1x2_t b) {
18875   vst2_s64(a, b);
18876 }
18877 
18878 // CHECK-LABEL: define void @test_vst2_f16(half* %a, [2 x i64] %b.coerce) #0 {
18879 // CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
18880 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
18881 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
18882 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
18883 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18884 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
18885 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
18886 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18887 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
18888 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
18889 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
18890 // CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
18891 // CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
18892 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
18893 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
18894 // CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
18895 // CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
18896 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
18897 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
18898 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
18899 // CHECK:   ret void
18900 void test_vst2_f16(float16_t * a, float16x4x2_t b) {
18901   vst2_f16(a, b);
18902 }
18903 
18904 // CHECK-LABEL: define void @test_vst2_f32(float* %a, [2 x i64] %b.coerce) #0 {
18905 // CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
18906 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
18907 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
18908 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
18909 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18910 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
18911 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
18912 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18913 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
18914 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
18915 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
18916 // CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
18917 // CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
18918 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
18919 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
18920 // CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
18921 // CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
18922 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
18923 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
18924 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 4)
18925 // CHECK:   ret void
18926 void test_vst2_f32(float32_t * a, float32x2x2_t b) {
18927   vst2_f32(a, b);
18928 }
18929 
18930 // CHECK-LABEL: define void @test_vst2_p8(i8* %a, [2 x i64] %b.coerce) #0 {
18931 // CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
18932 // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
18933 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
18934 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
18935 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18936 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
18937 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
18938 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18939 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
18940 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
18941 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
18942 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
18943 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
18944 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
18945 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
18946 // CHECK:   ret void
18947 void test_vst2_p8(poly8_t * a, poly8x8x2_t b) {
18948   vst2_p8(a, b);
18949 }
18950 
18951 // CHECK-LABEL: define void @test_vst2_p16(i16* %a, [2 x i64] %b.coerce) #0 {
18952 // CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
18953 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
18954 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
18955 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
18956 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18957 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
18958 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
18959 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18960 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
18961 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
18962 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
18963 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
18964 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
18965 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
18966 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
18967 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
18968 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
18969 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
18970 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
18971 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
18972 // CHECK:   ret void
18973 void test_vst2_p16(poly16_t * a, poly16x4x2_t b) {
18974   vst2_p16(a, b);
18975 }
18976 
18977 
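// The q-register lane variants that follow coerce the pair through [4 x i64]
// and memcpy 32 bytes; llvm.arm.neon.vst2lane takes the lane index ahead of
// the alignment operand, and each test exercises the highest valid lane
// (7 for <8 x i16>, 3 for <4 x i32> and <4 x float>).
//
// Hedged usage sketch (out and q are illustrative names, with q assumed to be
// a previously loaded uint16x8x2_t):
//
//   uint16_t out[2];
//   vst2q_lane_u16(out, q, 7);       // stores q.val[0][7] then q.val[1][7]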
18978 // CHECK-LABEL: define void @test_vst2q_lane_u16(i16* %a, [4 x i64] %b.coerce) #0 {
18979 // CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
18980 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
18981 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
18982 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
18983 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
18984 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
18985 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
18986 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
18987 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
18988 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
18989 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
18990 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
18991 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
18992 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
18993 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
18994 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
18995 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
18996 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
18997 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
18998 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
18999 // CHECK:   ret void
19000 void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) {
19001   vst2q_lane_u16(a, b, 7);
19002 }
19003 
19004 // CHECK-LABEL: define void @test_vst2q_lane_u32(i32* %a, [4 x i64] %b.coerce) #0 {
19005 // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
19006 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
19007 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
19008 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
19009 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
19010 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
19011 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
19012 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
19013 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
19014 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
19015 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
19016 // CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
19017 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
19018 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
19019 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
19020 // CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
19021 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
19022 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
19023 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
19024 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
19025 // CHECK:   ret void
19026 void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) {
19027   vst2q_lane_u32(a, b, 3);
19028 }
19029 
19030 // CHECK-LABEL: define void @test_vst2q_lane_s16(i16* %a, [4 x i64] %b.coerce) #0 {
19031 // CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
19032 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
19033 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
19034 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
19035 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
19036 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
19037 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
19038 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
19039 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19040 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
19041 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
19042 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
19043 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
19044 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
19045 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
19046 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
19047 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
19048 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
19049 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
19050 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
19051 // CHECK:   ret void
19052 void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) {
19053   vst2q_lane_s16(a, b, 7);
19054 }
19055 
19056 // CHECK-LABEL: define void @test_vst2q_lane_s32(i32* %a, [4 x i64] %b.coerce) #0 {
19057 // CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
19058 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
19059 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
19060 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
19061 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
19062 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
19063 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
19064 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
19065 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
19066 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
19067 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
19068 // CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
19069 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
19070 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
19071 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
19072 // CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
19073 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
19074 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
19075 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
19076 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
19077 // CHECK:   ret void
19078 void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
19079   vst2q_lane_s32(a, b, 3);
19080 }
19081 
19082 // CHECK-LABEL: define void @test_vst2q_lane_f16(half* %a, [4 x i64] %b.coerce) #0 {
19083 // CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
19084 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
19085 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
19086 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
19087 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
19088 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
19089 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
19090 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
19091 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
19092 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
19093 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
19094 // CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
19095 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
19096 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
19097 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
19098 // CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
19099 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
19100 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
19101 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
19102 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
19103 // CHECK:   ret void
19104 void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
19105   vst2q_lane_f16(a, b, 7);
19106 }
19107 
19108 // CHECK-LABEL: define void @test_vst2q_lane_f32(float* %a, [4 x i64] %b.coerce) #0 {
19109 // CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
19110 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
19111 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
19112 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
19113 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
19114 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
19115 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
19116 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
19117 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
19118 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
19119 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
19120 // CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
19121 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
19122 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
19123 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
19124 // CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
19125 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
19126 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
19127 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
19128 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 3, i32 4)
19129 // CHECK:   ret void
19130 void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
19131   vst2q_lane_f32(a, b, 3);
19132 }
19133 
19134 // CHECK-LABEL: define void @test_vst2q_lane_p16(i16* %a, [4 x i64] %b.coerce) #0 {
19135 // CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
19136 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
19137 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
19138 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
19139 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
19140 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
19141 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
19142 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
19143 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19144 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
19145 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
19146 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
19147 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
19148 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
19149 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
19150 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
19151 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
19152 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
19153 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
19154 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
19155 // CHECK:   ret void
19156 void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) {
19157   vst2q_lane_p16(a, b, 7);
19158 }
19159 
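// The d-register lane stores below mirror the q-register ones with [2 x i64]
// coercion and a 16-byte memcpy; the <8 x i8> variants skip the bitcast
// round trip entirely and pass %a straight through as i8*.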
19160 // CHECK-LABEL: define void @test_vst2_lane_u8(i8* %a, [2 x i64] %b.coerce) #0 {
19161 // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
19162 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
19163 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
19164 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
19165 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19166 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
19167 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
19168 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19169 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
19170 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
19171 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
19172 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
19173 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
19174 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
19175 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
19176 // CHECK:   ret void
19177 void test_vst2_lane_u8(uint8_t * a, uint8x8x2_t b) {
19178   vst2_lane_u8(a, b, 7);
19179 }
19180 
19181 // CHECK-LABEL: define void @test_vst2_lane_u16(i16* %a, [2 x i64] %b.coerce) #0 {
19182 // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
19183 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
19184 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
19185 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
19186 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19187 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
19188 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
19189 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19190 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19191 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
19192 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
19193 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
19194 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
19195 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
19196 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
19197 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
19198 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
19199 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19200 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19201 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
19202 // CHECK:   ret void
19203 void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) {
19204   vst2_lane_u16(a, b, 3);
19205 }
19206 
19207 // CHECK-LABEL: define void @test_vst2_lane_u32(i32* %a, [2 x i64] %b.coerce) #0 {
19208 // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
19209 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
19210 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
19211 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
19212 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19213 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
19214 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
19215 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19216 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
19217 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
19218 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
19219 // CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
19220 // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
19221 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
19222 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
19223 // CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
19224 // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
19225 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
19226 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
19227 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
19228 // CHECK:   ret void
19229 void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) {
19230   vst2_lane_u32(a, b, 1);
19231 }
19232 
19233 // CHECK-LABEL: define void @test_vst2_lane_s8(i8* %a, [2 x i64] %b.coerce) #0 {
19234 // CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
19235 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
19236 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
19237 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
19238 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19239 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
19240 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
19241 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19242 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
19243 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
19244 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
19245 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
19246 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
19247 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
19248 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
19249 // CHECK:   ret void
19250 void test_vst2_lane_s8(int8_t * a, int8x8x2_t b) {
19251   vst2_lane_s8(a, b, 7);
19252 }
19253 
19254 // CHECK-LABEL: define void @test_vst2_lane_s16(i16* %a, [2 x i64] %b.coerce) #0 {
19255 // CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
19256 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
19257 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
19258 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
19259 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19260 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
19261 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
19262 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19263 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19264 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
19265 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
19266 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
19267 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
19268 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
19269 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
19270 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
19271 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
19272 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19273 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19274 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
19275 // CHECK:   ret void
19276 void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) {
19277   vst2_lane_s16(a, b, 3);
19278 }
19279 
19280 // CHECK-LABEL: define void @test_vst2_lane_s32(i32* %a, [2 x i64] %b.coerce) #0 {
19281 // CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
19282 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
19283 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
19284 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
19285 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19286 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
19287 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
19288 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19289 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
19290 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
19291 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
19292 // CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
19293 // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
19294 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
19295 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
19296 // CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
19297 // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
19298 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
19299 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
19300 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
19301 // CHECK:   ret void
19302 void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) {
19303   vst2_lane_s32(a, b, 1);
19304 }
19305 
19306 // CHECK-LABEL: define void @test_vst2_lane_f16(half* %a, [2 x i64] %b.coerce) #0 {
19307 // CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
19308 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
19309 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
19310 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
19311 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19312 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
19313 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
19314 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19315 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
19316 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
19317 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
19318 // CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
19319 // CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
19320 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
19321 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
19322 // CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
19323 // CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
19324 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19325 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19326 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
19327 // CHECK:   ret void
19328 void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) {
19329   vst2_lane_f16(a, b, 3);
19330 }
19331 
19332 // CHECK-LABEL: define void @test_vst2_lane_f32(float* %a, [2 x i64] %b.coerce) #0 {
19333 // CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
19334 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
19335 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
19336 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
19337 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19338 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
19339 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
19340 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19341 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
19342 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
19343 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
19344 // CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
19345 // CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
19346 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
19347 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
19348 // CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
19349 // CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
19350 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
19351 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
19352 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 1, i32 4)
19353 // CHECK:   ret void
19354 void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) {
19355   vst2_lane_f32(a, b, 1);
19356 }
19357 
19358 // CHECK-LABEL: define void @test_vst2_lane_p8(i8* %a, [2 x i64] %b.coerce) #0 {
19359 // CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
19360 // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
19361 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
19362 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
19363 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19364 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
19365 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
19366 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19367 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
19368 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
19369 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
19370 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
19371 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
19372 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
19373 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
19374 // CHECK:   ret void
19375 void test_vst2_lane_p8(poly8_t * a, poly8x8x2_t b) {
19376   vst2_lane_p8(a, b, 7);
19377 }
19378 
19379 // CHECK-LABEL: define void @test_vst2_lane_p16(i16* %a, [2 x i64] %b.coerce) #0 {
19380 // CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
19381 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
19382 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
19383 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
19384 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19385 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
19386 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
19387 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19388 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19389 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
19390 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
19391 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
19392 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
19393 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
19394 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
19395 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
19396 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
19397 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19398 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19399 // CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
19400 // CHECK:   ret void
19401 void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) {
19402   vst2_lane_p16(a, b, 3);
19403 }
19404 
19405 
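// From here on the vst3 tests add a third vector: the q forms coerce the
// struct through [6 x i64], memcpy 48 bytes, and hand three registers to
// llvm.arm.neon.vst3 ahead of the alignment operand.
//
// Hedged sketch (rgb and t are illustrative, with t assumed to be a loaded
// uint16x8x3_t); a 3-way interleaved store writes 24 lanes here:
//
//   uint16_t rgb[24];
//   vst3q_u16(rgb, t);               // rgb[0] = t.val[0][0], rgb[1] = t.val[1][0],
//                                    // rgb[2] = t.val[2][0], rgb[3] = t.val[0][1], ...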
19406 // CHECK-LABEL: define void @test_vst3q_u8(i8* %a, [6 x i64] %b.coerce) #0 {
19407 // CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
19408 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
19409 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
19410 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
19411 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19412 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
19413 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
19414 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19415 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
19416 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
19417 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
19418 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
19419 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
19420 // CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
19421 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
19422 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
19423 // CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
19424 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
19425 // CHECK:   ret void
19426 void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
19427   vst3q_u8(a, b);
19428 }
19429 
19430 // CHECK-LABEL: define void @test_vst3q_u16(i16* %a, [6 x i64] %b.coerce) #0 {
19431 // CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
19432 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
19433 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
19434 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
19435 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19436 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
19437 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
19438 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19439 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19440 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
19441 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
19442 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
19443 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
19444 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
19445 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
19446 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
19447 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
19448 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
19449 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
19450 // CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
19451 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
19452 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
19453 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
19454 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
19455 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
19456 // CHECK:   ret void
19457 void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) {
19458   vst3q_u16(a, b);
19459 }
19460 
19461 // CHECK-LABEL: define void @test_vst3q_u32(i32* %a, [6 x i64] %b.coerce) #0 {
19462 // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
19463 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
19464 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
19465 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
19466 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19467 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
19468 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
19469 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19470 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
19471 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
19472 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
19473 // CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
19474 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
19475 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
19476 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
19477 // CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
19478 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
19479 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
19480 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
19481 // CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
19482 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
19483 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
19484 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
19485 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
19486 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
19487 // CHECK:   ret void
19488 void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) {
19489   vst3q_u32(a, b);
19490 }
19491 
19492 // CHECK-LABEL: define void @test_vst3q_s8(i8* %a, [6 x i64] %b.coerce) #0 {
19493 // CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
19494 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
19495 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
19496 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
19497 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19498 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
19499 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
19500 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19501 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
19502 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
19503 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
19504 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
19505 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
19506 // CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
19507 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
19508 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
19509 // CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
19510 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
19511 // CHECK:   ret void
19512 void test_vst3q_s8(int8_t * a, int8x16x3_t b) {
19513   vst3q_s8(a, b);
19514 }
19515 
19516 // CHECK-LABEL: define void @test_vst3q_s16(i16* %a, [6 x i64] %b.coerce) #0 {
19517 // CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
19518 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
19519 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
19520 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
19521 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19522 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
19523 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
19524 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19525 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19526 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
19527 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
19528 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
19529 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
19530 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
19531 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
19532 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
19533 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
19534 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
19535 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
19536 // CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
19537 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
19538 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
19539 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
19540 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
19541 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
19542 // CHECK:   ret void
19543 void test_vst3q_s16(int16_t * a, int16x8x3_t b) {
19544   vst3q_s16(a, b);
19545 }
19546 
19547 // CHECK-LABEL: define void @test_vst3q_s32(i32* %a, [6 x i64] %b.coerce) #0 {
19548 // CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
19549 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
19550 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
19551 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
19552 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19553 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
19554 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
19555 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19556 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
19557 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
19558 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
19559 // CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
19560 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
19561 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
19562 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
19563 // CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
19564 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
19565 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
19566 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
19567 // CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
19568 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
19569 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
19570 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
19571 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
19572 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
19573 // CHECK:   ret void
19574 void test_vst3q_s32(int32_t * a, int32x4x3_t b) {
19575   vst3q_s32(a, b);
19576 }
19577 
19578 // CHECK-LABEL: define void @test_vst3q_f16(half* %a, [6 x i64] %b.coerce) #0 {
19579 // CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
19580 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
19581 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
19582 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
19583 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19584 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
19585 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
19586 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19587 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
19588 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
19589 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
19590 // CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
19591 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
19592 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
19593 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
19594 // CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
19595 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
19596 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
19597 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
19598 // CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
19599 // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
19600 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
19601 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
19602 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
19603 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
19604 // CHECK:   ret void
19605 void test_vst3q_f16(float16_t * a, float16x8x3_t b) {
19606   vst3q_f16(a, b);
19607 }
19608 
19609 // CHECK-LABEL: define void @test_vst3q_f32(float* %a, [6 x i64] %b.coerce) #0 {
19610 // CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
19611 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
19612 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
19613 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
19614 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19615 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
19616 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
19617 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19618 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
19619 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
19620 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
19621 // CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
19622 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
19623 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
19624 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
19625 // CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
19626 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
19627 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
19628 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
19629 // CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
19630 // CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
19631 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
19632 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
19633 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
19634 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 4)
19635 // CHECK:   ret void
19636 void test_vst3q_f32(float32_t * a, float32x4x3_t b) {
19637   vst3q_f32(a, b);
19638 }
19639 
19640 // CHECK-LABEL: define void @test_vst3q_p8(i8* %a, [6 x i64] %b.coerce) #0 {
19641 // CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
19642 // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
19643 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
19644 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
19645 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19646 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
19647 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
19648 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19649 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
19650 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
19651 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
19652 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
19653 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
19654 // CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
19655 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
19656 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
19657 // CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
19658 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
19659 // CHECK:   ret void
19660 void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) {
19661   vst3q_p8(a, b);
19662 }
19663 
19664 // CHECK-LABEL: define void @test_vst3q_p16(i16* %a, [6 x i64] %b.coerce) #0 {
19665 // CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
19666 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
19667 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
19668 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
19669 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19670 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
19671 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
19672 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19673 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19674 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
19675 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
19676 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
19677 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
19678 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
19679 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
19680 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
19681 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
19682 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
19683 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
19684 // CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
19685 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
19686 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
19687 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
19688 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
19689 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
19690 // CHECK:   ret void
19691 void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) {
19692   vst3q_p16(a, b);
19693 }
19694 
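// The 64-bit (d-register) vst3 tests below follow the same lowering shape as
// the q-register tests above: the three-vector aggregate is coerced through a
// [3 x i64] array at 8-byte alignment, copied into a local with @llvm.memcpy,
// and each element is loaded and handed to @llvm.arm.neon.vst3 together with a
// trailing i32 alignment operand.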
19695 // CHECK-LABEL: define void @test_vst3_u8(i8* %a, [3 x i64] %b.coerce) #0 {
19696 // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
19697 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
19698 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
19699 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
19700 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19701 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
19702 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
19703 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19704 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
19705 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
19706 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
19707 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
19708 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
19709 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
19710 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
19711 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
19712 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
19713 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
19714 // CHECK:   ret void
19715 void test_vst3_u8(uint8_t * a, uint8x8x3_t b) {
19716   vst3_u8(a, b);
19717 }
19718 
19719 // CHECK-LABEL: define void @test_vst3_u16(i16* %a, [3 x i64] %b.coerce) #0 {
19720 // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
19721 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
19722 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
19723 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
19724 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19725 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
19726 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
19727 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19728 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19729 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
19730 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
19731 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
19732 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
19733 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
19734 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
19735 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
19736 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
19737 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
19738 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
19739 // CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
19740 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
19741 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19742 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19743 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
19744 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
19745 // CHECK:   ret void
19746 void test_vst3_u16(uint16_t * a, uint16x4x3_t b) {
19747   vst3_u16(a, b);
19748 }
19749 
19750 // CHECK-LABEL: define void @test_vst3_u32(i32* %a, [3 x i64] %b.coerce) #0 {
19751 // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
19752 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
19753 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
19754 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
19755 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19756 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
19757 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
19758 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19759 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
19760 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
19761 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
19762 // CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
19763 // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
19764 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
19765 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
19766 // CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
19767 // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
19768 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
19769 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
19770 // CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
19771 // CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
19772 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
19773 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
19774 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
19775 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4)
19776 // CHECK:   ret void
19777 void test_vst3_u32(uint32_t * a, uint32x2x3_t b) {
19778   vst3_u32(a, b);
19779 }
19780 
19781 // CHECK-LABEL: define void @test_vst3_u64(i64* %a, [3 x i64] %b.coerce) #0 {
19782 // CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
19783 // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
19784 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
19785 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <1 x i64>]* [[COERCE_DIVE]] to [3 x i64]*
19786 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19787 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
19788 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
19789 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19790 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
19791 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
19792 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
19793 // CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
19794 // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
19795 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
19796 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i32 0, i32 1
19797 // CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
19798 // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
19799 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
19800 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i32 0, i32 2
19801 // CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
19802 // CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
19803 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
19804 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
19805 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
19806 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4)
19807 // CHECK:   ret void
19808 void test_vst3_u64(uint64_t * a, uint64x1x3_t b) {
19809   vst3_u64(a, b);
19810 }
19811 
19812 // CHECK-LABEL: define void @test_vst3_s8(i8* %a, [3 x i64] %b.coerce) #0 {
19813 // CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
19814 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
19815 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
19816 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
19817 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19818 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
19819 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
19820 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19821 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
19822 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
19823 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
19824 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
19825 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
19826 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
19827 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
19828 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
19829 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
19830 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
19831 // CHECK:   ret void
19832 void test_vst3_s8(int8_t * a, int8x8x3_t b) {
19833   vst3_s8(a, b);
19834 }
19835 
19836 // CHECK-LABEL: define void @test_vst3_s16(i16* %a, [3 x i64] %b.coerce) #0 {
19837 // CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
19838 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
19839 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
19840 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
19841 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19842 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
19843 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
19844 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19845 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19846 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
19847 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
19848 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
19849 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
19850 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
19851 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
19852 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
19853 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
19854 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
19855 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
19856 // CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
19857 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
19858 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19859 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19860 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
19861 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
19862 // CHECK:   ret void
19863 void test_vst3_s16(int16_t * a, int16x4x3_t b) {
19864   vst3_s16(a, b);
19865 }
19866 
19867 // CHECK-LABEL: define void @test_vst3_s32(i32* %a, [3 x i64] %b.coerce) #0 {
19868 // CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
19869 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
19870 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
19871 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
19872 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19873 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
19874 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
19875 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19876 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
19877 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
19878 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
19879 // CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
19880 // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
19881 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
19882 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
19883 // CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
19884 // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
19885 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
19886 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
19887 // CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
19888 // CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
19889 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
19890 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
19891 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
19892 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4)
19893 // CHECK:   ret void
19894 void test_vst3_s32(int32_t * a, int32x2x3_t b) {
19895   vst3_s32(a, b);
19896 }
19897 
19898 // CHECK-LABEL: define void @test_vst3_s64(i64* %a, [3 x i64] %b.coerce) #0 {
19899 // CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
19900 // CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
19901 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
19902 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <1 x i64>]* [[COERCE_DIVE]] to [3 x i64]*
19903 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19904 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
19905 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
19906 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19907 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
19908 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
19909 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
19910 // CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
19911 // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
19912 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
19913 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i32 0, i32 1
19914 // CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
19915 // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
19916 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
19917 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i32 0, i32 2
19918 // CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
19919 // CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
19920 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
19921 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
19922 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
19923 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4)
19924 // CHECK:   ret void
19925 void test_vst3_s64(int64_t * a, int64x1x3_t b) {
19926   vst3_s64(a, b);
19927 }
19928 
19929 // CHECK-LABEL: define void @test_vst3_f16(half* %a, [3 x i64] %b.coerce) #0 {
19930 // CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
19931 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
19932 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
19933 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
19934 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19935 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
19936 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
19937 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19938 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
19939 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
19940 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
19941 // CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
19942 // CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
19943 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
19944 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
19945 // CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
19946 // CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
19947 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
19948 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
19949 // CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
19950 // CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
19951 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19952 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19953 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
19954 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
19955 // CHECK:   ret void
19956 void test_vst3_f16(float16_t * a, float16x4x3_t b) {
19957   vst3_f16(a, b);
19958 }
19959 
19960 // CHECK-LABEL: define void @test_vst3_f32(float* %a, [3 x i64] %b.coerce) #0 {
19961 // CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
19962 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
19963 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
19964 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
19965 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19966 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
19967 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
19968 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19969 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
19970 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
19971 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
19972 // CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
19973 // CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
19974 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
19975 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
19976 // CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
19977 // CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
19978 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
19979 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
19980 // CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
19981 // CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
19982 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
19983 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
19984 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
19985 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 4)
19986 // CHECK:   ret void
19987 void test_vst3_f32(float32_t * a, float32x2x3_t b) {
19988   vst3_f32(a, b);
19989 }
19990 
19991 // CHECK-LABEL: define void @test_vst3_p8(i8* %a, [3 x i64] %b.coerce) #0 {
19992 // CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
19993 // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
19994 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
19995 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
19996 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19997 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
19998 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
19999 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
20000 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
20001 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
20002 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
20003 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
20004 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
20005 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
20006 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
20007 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
20008 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
20009 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
20010 // CHECK:   ret void
20011 void test_vst3_p8(poly8_t * a, poly8x8x3_t b) {
20012   vst3_p8(a, b);
20013 }
20014 
20015 // CHECK-LABEL: define void @test_vst3_p16(i16* %a, [3 x i64] %b.coerce) #0 {
20016 // CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
20017 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
20018 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
20019 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
20020 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
20021 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
20022 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
20023 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
20024 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
20025 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
20026 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
20027 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
20028 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
20029 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
20030 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
20031 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
20032 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
20033 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
20034 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
20035 // CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
20036 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
20037 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
20038 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
20039 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
20040 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
20041 // CHECK:   ret void
20042 void test_vst3_p16(poly16_t * a, poly16x4x3_t b) {
20043   vst3_p16(a, b);
20044 }
20045 
20046 
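// The vst3*_lane tests below share the aggregate-coercion prologue, but the
// intrinsic call lowers to @llvm.arm.neon.vst3lane, which carries the lane
// index as an extra i32 operand ahead of the trailing alignment operand
// (e.g. i32 7, i32 2 for lane 7 of an <8 x i16> triple).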
20047 // CHECK-LABEL: define void @test_vst3q_lane_u16(i16* %a, [6 x i64] %b.coerce) #0 {
20048 // CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
20049 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
20050 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
20051 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
20052 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
20053 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
20054 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
20055 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
20056 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
20057 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
20058 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
20059 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
20060 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
20061 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
20062 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
20063 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
20064 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
20065 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
20066 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
20067 // CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
20068 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
20069 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
20070 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
20071 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
20072 // CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
20073 // CHECK:   ret void
20074 void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) {
20075   vst3q_lane_u16(a, b, 7);
20076 }
20077 
20078 // CHECK-LABEL: define void @test_vst3q_lane_u32(i32* %a, [6 x i64] %b.coerce) #0 {
20079 // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
20080 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
20081 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
20082 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
20083 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
20084 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
20085 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
20086 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
20087 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
20088 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
20089 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
20090 // CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
20091 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
20092 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
20093 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
20094 // CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
20095 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
20096 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
20097 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
20098 // CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
20099 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
20100 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
20101 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
20102 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
20103 // CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
20104 // CHECK:   ret void
20105 void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) {
20106   vst3q_lane_u32(a, b, 3);
20107 }
20108 
20109 // CHECK-LABEL: define void @test_vst3q_lane_s16(i16* %a, [6 x i64] %b.coerce) #0 {
20110 // CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
20111 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
20112 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
20113 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
20114 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
20115 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
20116 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
20117 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
20118 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
20119 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
20120 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
20121 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
20122 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
20123 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
20124 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
20125 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
20126 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
20127 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
20128 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
20129 // CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
20130 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
20131 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
20132 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
20133 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
20134 // CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
20135 // CHECK:   ret void
20136 void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) {
20137   vst3q_lane_s16(a, b, 7);
20138 }
20139 
20140 // CHECK-LABEL: define void @test_vst3q_lane_s32(i32* %a, [6 x i64] %b.coerce) #0 {
20141 // CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
20142 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
20143 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
20144 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
20145 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
20146 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
20147 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
20148 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
20149 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
20150 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
20151 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
20152 // CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
20153 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
20154 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
20155 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
20156 // CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
20157 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
20158 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
20159 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
20160 // CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
20161 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
20162 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
20163 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
20164 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
20165 // CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
20166 // CHECK:   ret void
20167 void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
20168   vst3q_lane_s32(a, b, 3);
20169 }
20170 
20171 // CHECK-LABEL: define void @test_vst3q_lane_f16(half* %a, [6 x i64] %b.coerce) #0 {
20172 // CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
20173 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
20174 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
20175 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
20176 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
20177 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
20178 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
20179 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
20180 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
20181 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
20182 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
20183 // CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
20184 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
20185 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
20186 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
20187 // CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
20188 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
20189 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
20190 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
20191 // CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
20192 // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
20193 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
20194 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
20195 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
20196 // CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
20197 // CHECK:   ret void
20198 void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
20199   vst3q_lane_f16(a, b, 7);
20200 }
20201 
20202 // CHECK-LABEL: define void @test_vst3q_lane_f32(float* %a, [6 x i64] %b.coerce) #0 {
20203 // CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
20204 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
20205 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
20206 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
20207 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
20208 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
20209 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
20210 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
20211 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
20212 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
20213 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
20214 // CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
20215 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
20216 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
20217 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
20218 // CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
20219 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
20220 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
20221 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
20222 // CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
20223 // CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
20224 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
20225 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
20226 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
20227 // CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 3, i32 4)
20228 // CHECK:   ret void
20229 void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) {
20230   vst3q_lane_f32(a, b, 3);
20231 }
20232 
20233 // CHECK-LABEL: define void @test_vst3q_lane_p16(i16* %a, [6 x i64] %b.coerce) #0 {
20234 // CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
20235 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
20236 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
20237 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
20238 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
20239 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
20240 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
20241 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
20242 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
20243 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
20244 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
20245 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
20246 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
20247 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
20248 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
20249 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
20250 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
20251 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
20252 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
20253 // CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
20254 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
20255 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
20256 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
20257 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
20258 // CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
20259 // CHECK:   ret void
test_vst3q_lane_p16(poly16_t * a,poly16x8x3_t b)20260 void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) {
20261   vst3q_lane_p16(a, b, 7);
20262 }
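
// The three vst3q_lane tests above all lower to the same IR shape: three Q
// registers (bitcast through <16 x i8> where needed), a lane index, and a
// byte-alignment immediate. A hedged usage sketch of the memory effect; the
// helper name is illustrative and not part of the checked output:
void example_store_lane7_p16(poly16_t *p, poly16x8x3_t b) {
  // Writes lane 7 of b.val[0], b.val[1], b.val[2] contiguously (6 bytes).
  vst3q_lane_p16(p, b, 7);
}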

// CHECK-LABEL: define void @test_vst3_lane_u8(i8* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
// CHECK:   ret void
void test_vst3_lane_u8(uint8_t * a, uint8x8x3_t b) {
  vst3_lane_u8(a, b, 7);
}

// CHECK-LABEL: define void @test_vst3_lane_u16(i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
// CHECK:   ret void
void test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) {
  vst3_lane_u16(a, b, 3);
}

// CHECK-LABEL: define void @test_vst3_lane_u32(i32* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
// CHECK:   ret void
void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) {
  vst3_lane_u32(a, b, 1);
}
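
// In each @llvm.arm.neon.vst3lane call above, the two trailing i32
// immediates are the lane index and the byte alignment, e.g. "i32 1, i32 4"
// for the v2i32 case. A hedged scalar model of the memory effect; the helper
// is illustrative and not the actual lowering:
void example_vst3_lane_u32_model(uint32_t *p, uint32x2x3_t v) {
  // vst3_lane_u32(p, v, 1) stores p[i] = lane 1 of v.val[i] for i = 0..2.
  p[0] = vget_lane_u32(v.val[0], 1);
  p[1] = vget_lane_u32(v.val[1], 1);
  p[2] = vget_lane_u32(v.val[2], 1);
}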

// CHECK-LABEL: define void @test_vst3_lane_s8(i8* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
// CHECK:   ret void
void test_vst3_lane_s8(int8_t * a, int8x8x3_t b) {
  vst3_lane_s8(a, b, 7);
}

// CHECK-LABEL: define void @test_vst3_lane_s16(i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
// CHECK:   ret void
void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) {
  vst3_lane_s16(a, b, 3);
}

// CHECK-LABEL: define void @test_vst3_lane_s32(i32* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
// CHECK:   ret void
void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) {
  vst3_lane_s32(a, b, 1);
}

// CHECK-LABEL: define void @test_vst3_lane_f16(half* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
// CHECK:   ret void
void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) {
  vst3_lane_f16(a, b, 3);
}
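
// test_vst3_lane_f16 shows there is no half-precision vst3lane form at the
// IR level: the <4 x half> values are bitcast to <4 x i16> and the v4i16
// intrinsic is reused, which is safe because the store only moves bit
// patterns. A hedged equivalent using explicit reinterpret casts, assuming
// vreinterpret_u16_f16 is available in this arm_neon.h configuration; the
// helper itself is illustrative:
void example_store_f16_bits(uint16_t *p, float16x4x3_t b) {
  uint16x4x3_t bits;
  // Reinterpret each half vector as raw 16-bit lanes before storing.
  bits.val[0] = vreinterpret_u16_f16(b.val[0]);
  bits.val[1] = vreinterpret_u16_f16(b.val[1]);
  bits.val[2] = vreinterpret_u16_f16(b.val[2]);
  vst3_lane_u16(p, bits, 3);
}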

// CHECK-LABEL: define void @test_vst3_lane_f32(float* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 1, i32 4)
// CHECK:   ret void
void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) {
  vst3_lane_f32(a, b, 1);
}

// CHECK-LABEL: define void @test_vst3_lane_p8(i8* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
// CHECK:   ret void
void test_vst3_lane_p8(poly8_t * a, poly8x8x3_t b) {
  vst3_lane_p8(a, b, 7);
}

// CHECK-LABEL: define void @test_vst3_lane_p16(i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
// CHECK:   ret void
void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) {
  vst3_lane_p16(a, b, 3);
}
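
// Across all the vst3_lane variants above only the element type changes; the
// interleaved layout is always lane k of val[0], then val[1], then val[2].
// A hedged round-trip sketch of that layout (helper and buffer contents are
// illustrative):
void example_vst3_lane_roundtrip(uint16_t rgb[3]) {
  uint16x4x3_t t;
  t.val[0] = vdup_n_u16(rgb[0]);
  t.val[1] = vdup_n_u16(rgb[1]);
  t.val[2] = vdup_n_u16(rgb[2]);
  // Storing any single lane writes back {rgb[0], rgb[1], rgb[2]}.
  vst3_lane_u16(rgb, t, 3);
}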


// CHECK-LABEL: define void @test_vst4q_u8(i8* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
// CHECK:   ret void
void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) {
  vst4q_u8(a, b);
}

// CHECK-LABEL: define void @test_vst4q_u16(i16* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
// CHECK:   ret void
void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) {
  vst4q_u16(a, b);
}
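
// Unlike the *_lane forms, vst4q stores every lane: memory receives
// val[0][0], val[1][0], val[2][0], val[3][0], val[0][1], and so on, with the
// final i32 immediate again being the byte alignment. A hedged scalar model
// of that layout (the helper is illustrative, not the actual lowering):
void example_vst4q_u16_model(uint16_t *p, uint16x8x4_t v) {
  uint16_t tmp[4][8];
  vst1q_u16(tmp[0], v.val[0]);
  vst1q_u16(tmp[1], v.val[1]);
  vst1q_u16(tmp[2], v.val[2]);
  vst1q_u16(tmp[3], v.val[3]);
  for (int lane = 0; lane < 8; ++lane)
    for (int reg = 0; reg < 4; ++reg)
      *p++ = tmp[reg][lane];  // interleave four registers lane by lane
}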
20616 
20617 // CHECK-LABEL: define void @test_vst4q_u32(i32* %a, [8 x i64] %b.coerce) #0 {
20618 // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
20619 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
20620 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
20621 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
20622 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20623 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
20624 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
20625 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20626 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
20627 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
20628 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
20629 // CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
20630 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
20631 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
20632 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
20633 // CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
20634 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
20635 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
20636 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
20637 // CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
20638 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
20639 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
20640 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
20641 // CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
20642 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
20643 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
20644 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
20645 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
20646 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
20647 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
20648 // CHECK:   ret void
test_vst4q_u32(uint32_t * a,uint32x4x4_t b)20649 void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) {
20650   vst4q_u32(a, b);
20651 }
20652 
20653 // CHECK-LABEL: define void @test_vst4q_s8(i8* %a, [8 x i64] %b.coerce) #0 {
20654 // CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
20655 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
20656 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
20657 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
20658 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20659 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
20660 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
20661 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20662 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
20663 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
20664 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
20665 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
20666 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
20667 // CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
20668 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
20669 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
20670 // CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
20671 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
20672 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
20673 // CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
20674 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
20675 // CHECK:   ret void
test_vst4q_s8(int8_t * a,int8x16x4_t b)20676 void test_vst4q_s8(int8_t * a, int8x16x4_t b) {
20677   vst4q_s8(a, b);
20678 }
20679 
20680 // CHECK-LABEL: define void @test_vst4q_s16(i16* %a, [8 x i64] %b.coerce) #0 {
20681 // CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
20682 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
20683 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
20684 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
20685 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20686 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
20687 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
20688 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20689 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
20690 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
20691 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
20692 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
20693 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
20694 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
20695 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
20696 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
20697 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
20698 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
20699 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
20700 // CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
20701 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
20702 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
20703 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
20704 // CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
20705 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
20706 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
20707 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
20708 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
20709 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
20710 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
20711 // CHECK:   ret void
test_vst4q_s16(int16_t * a,int16x8x4_t b)20712 void test_vst4q_s16(int16_t * a, int16x8x4_t b) {
20713   vst4q_s16(a, b);
20714 }
20715 
20716 // CHECK-LABEL: define void @test_vst4q_s32(i32* %a, [8 x i64] %b.coerce) #0 {
20717 // CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
20718 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
20719 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
20720 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
20721 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20722 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
20723 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
20724 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20725 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
20726 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
20727 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
20728 // CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
20729 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
20730 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
20731 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
20732 // CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
20733 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
20734 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
20735 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
20736 // CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
20737 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
20738 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
20739 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
20740 // CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
20741 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
20742 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
20743 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
20744 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
20745 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
20746 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
20747 // CHECK:   ret void
test_vst4q_s32(int32_t * a,int32x4x4_t b)20748 void test_vst4q_s32(int32_t * a, int32x4x4_t b) {
20749   vst4q_s32(a, b);
20750 }
20751 
20752 // CHECK-LABEL: define void @test_vst4q_f16(half* %a, [8 x i64] %b.coerce) #0 {
20753 // CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
20754 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
20755 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
20756 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
20757 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20758 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
20759 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
20760 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20761 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
20762 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
20763 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
20764 // CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
20765 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
20766 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
20767 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
20768 // CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
20769 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
20770 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
20771 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
20772 // CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
20773 // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
20774 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
20775 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
20776 // CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
20777 // CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
20778 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
20779 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
20780 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
20781 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
20782 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
20783 // CHECK:   ret void
test_vst4q_f16(float16_t * a,float16x8x4_t b)20784 void test_vst4q_f16(float16_t * a, float16x8x4_t b) {
20785   vst4q_f16(a, b);
20786 }
20787 
20788 // CHECK-LABEL: define void @test_vst4q_f32(float* %a, [8 x i64] %b.coerce) #0 {
20789 // CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
20790 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
20791 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
20792 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
20793 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20794 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
20795 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
20796 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20797 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
20798 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
20799 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
20800 // CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
20801 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
20802 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
20803 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
20804 // CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
20805 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
20806 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
20807 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
20808 // CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
20809 // CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
20810 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
20811 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
20812 // CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
20813 // CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
20814 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
20815 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
20816 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
20817 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
20818 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 4)
20819 // CHECK:   ret void
test_vst4q_f32(float32_t * a,float32x4x4_t b)20820 void test_vst4q_f32(float32_t * a, float32x4x4_t b) {
20821   vst4q_f32(a, b);
20822 }
20823 
20824 // CHECK-LABEL: define void @test_vst4q_p8(i8* %a, [8 x i64] %b.coerce) #0 {
20825 // CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
20826 // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
20827 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
20828 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
20829 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20830 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
20831 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
20832 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20833 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
20834 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
20835 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
20836 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
20837 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
20838 // CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
20839 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
20840 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
20841 // CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
20842 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
20843 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
20844 // CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
20845 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
20846 // CHECK:   ret void
test_vst4q_p8(poly8_t * a,poly8x16x4_t b)20847 void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) {
20848   vst4q_p8(a, b);
20849 }
20850 
20851 // CHECK-LABEL: define void @test_vst4q_p16(i16* %a, [8 x i64] %b.coerce) #0 {
20852 // CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
20853 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
20854 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
20855 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
20856 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20857 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
20858 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
20859 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20860 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
20861 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
20862 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
20863 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
20864 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
20865 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
20866 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
20867 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
20868 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
20869 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
20870 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
20871 // CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
20872 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
20873 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
20874 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
20875 // CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
20876 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
20877 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
20878 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
20879 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
20880 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
20881 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
20882 // CHECK:   ret void
test_vst4q_p16(poly16_t * a,poly16x8x4_t b)20883 void test_vst4q_p16(poly16_t * a, poly16x8x4_t b) {
20884   vst4q_p16(a, b);
20885 }
20886 
20887 // CHECK-LABEL: define void @test_vst4_u8(i8* %a, [4 x i64] %b.coerce) #0 {
20888 // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
20889 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
20890 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
20891 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
20892 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
20893 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
20894 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
20895 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
20896 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
20897 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
20898 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
20899 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
20900 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
20901 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
20902 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
20903 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
20904 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
20905 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
20906 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
20907 // CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
20908 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
20909 // CHECK:   ret void
20910 void test_vst4_u8(uint8_t * a, uint8x8x4_t b) {
20911   vst4_u8(a, b);
20912 }
20913 
20914 // CHECK-LABEL: define void @test_vst4_u16(i16* %a, [4 x i64] %b.coerce) #0 {
20915 // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
20916 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
20917 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
20918 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
20919 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
20920 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
20921 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
20922 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
20923 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
20924 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
20925 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
20926 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
20927 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
20928 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
20929 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
20930 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
20931 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
20932 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
20933 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
20934 // CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
20935 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
20936 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
20937 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
20938 // CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
20939 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
20940 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
20941 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
20942 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
20943 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
20944 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
20945 // CHECK:   ret void
20946 void test_vst4_u16(uint16_t * a, uint16x4x4_t b) {
20947   vst4_u16(a, b);
20948 }
20949 
20950 // CHECK-LABEL: define void @test_vst4_u32(i32* %a, [4 x i64] %b.coerce) #0 {
20951 // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
20952 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
20953 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
20954 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
20955 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
20956 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
20957 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
20958 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
20959 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
20960 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
20961 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
20962 // CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
20963 // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
20964 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
20965 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
20966 // CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
20967 // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
20968 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
20969 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
20970 // CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
20971 // CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
20972 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
20973 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
20974 // CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
20975 // CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
20976 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
20977 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
20978 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
20979 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
20980 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
20981 // CHECK:   ret void
20982 void test_vst4_u32(uint32_t * a, uint32x2x4_t b) {
20983   vst4_u32(a, b);
20984 }
20985 
20986 // CHECK-LABEL: define void @test_vst4_u64(i64* %a, [4 x i64] %b.coerce) #0 {
20987 // CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
20988 // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
20989 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
20990 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <1 x i64>]* [[COERCE_DIVE]] to [4 x i64]*
20991 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
20992 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
20993 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
20994 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
20995 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
20996 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
20997 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
20998 // CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
20999 // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
21000 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
21001 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i32 0, i32 1
21002 // CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
21003 // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
21004 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
21005 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i32 0, i32 2
21006 // CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
21007 // CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
21008 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
21009 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i32 0, i32 3
21010 // CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
21011 // CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
21012 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
21013 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
21014 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
21015 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
21016 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
21017 // CHECK:   ret void
21018 void test_vst4_u64(uint64_t * a, uint64x1x4_t b) {
21019   vst4_u64(a, b);
21020 }
21021 
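// The signed variants expect IR identical to the unsigned ones above (only
// the struct type names differ): the vst4 store intrinsics are sign-agnostic
// and are selected purely by vector shape.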
21022 // CHECK-LABEL: define void @test_vst4_s8(i8* %a, [4 x i64] %b.coerce) #0 {
21023 // CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
21024 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
21025 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
21026 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
21027 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21028 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
21029 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
21030 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21031 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
21032 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
21033 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
21034 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
21035 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
21036 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
21037 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
21038 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
21039 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
21040 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
21041 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
21042 // CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
21043 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
21044 // CHECK:   ret void
21045 void test_vst4_s8(int8_t * a, int8x8x4_t b) {
21046   vst4_s8(a, b);
21047 }
21048 
21049 // CHECK-LABEL: define void @test_vst4_s16(i16* %a, [4 x i64] %b.coerce) #0 {
21050 // CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
21051 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
21052 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
21053 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
21054 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21055 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
21056 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
21057 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21058 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
21059 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
21060 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
21061 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
21062 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
21063 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
21064 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
21065 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
21066 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
21067 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
21068 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
21069 // CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
21070 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
21071 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
21072 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
21073 // CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
21074 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
21075 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
21076 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
21077 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
21078 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
21079 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
21080 // CHECK:   ret void
21081 void test_vst4_s16(int16_t * a, int16x4x4_t b) {
21082   vst4_s16(a, b);
21083 }
21084 
21085 // CHECK-LABEL: define void @test_vst4_s32(i32* %a, [4 x i64] %b.coerce) #0 {
21086 // CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
21087 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
21088 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
21089 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
21090 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21091 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
21092 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
21093 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21094 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
21095 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
21096 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
21097 // CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
21098 // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
21099 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
21100 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
21101 // CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
21102 // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
21103 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
21104 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
21105 // CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
21106 // CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
21107 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
21108 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
21109 // CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
21110 // CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
21111 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
21112 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
21113 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
21114 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
21115 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
21116 // CHECK:   ret void
21117 void test_vst4_s32(int32_t * a, int32x2x4_t b) {
21118   vst4_s32(a, b);
21119 }
21120 
21121 // CHECK-LABEL: define void @test_vst4_s64(i64* %a, [4 x i64] %b.coerce) #0 {
21122 // CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
21123 // CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
21124 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
21125 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <1 x i64>]* [[COERCE_DIVE]] to [4 x i64]*
21126 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21127 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
21128 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
21129 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21130 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
21131 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
21132 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
21133 // CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
21134 // CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
21135 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
21136 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i32 0, i32 1
21137 // CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
21138 // CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
21139 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
21140 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i32 0, i32 2
21141 // CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
21142 // CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
21143 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
21144 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i32 0, i32 3
21145 // CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
21146 // CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
21147 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
21148 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
21149 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
21150 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
21151 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
21152 // CHECK:   ret void
21153 void test_vst4_s64(int64_t * a, int64x1x4_t b) {
21154   vst4_s64(a, b);
21155 }
21156 
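// Note: the <4 x half> elements are bitcast through <8 x i8> and then to
// <4 x i16>, so vst4_f16 reuses the v4i16 form of the intrinsic rather than
// a dedicated half-precision one.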
21157 // CHECK-LABEL: define void @test_vst4_f16(half* %a, [4 x i64] %b.coerce) #0 {
21158 // CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
21159 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
21160 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
21161 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
21162 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21163 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
21164 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
21165 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21166 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
21167 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
21168 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
21169 // CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
21170 // CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
21171 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
21172 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
21173 // CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
21174 // CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
21175 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
21176 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
21177 // CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
21178 // CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
21179 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
21180 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
21181 // CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
21182 // CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
21183 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
21184 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
21185 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
21186 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
21187 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
21188 // CHECK:   ret void
21189 void test_vst4_f16(float16_t * a, float16x4x4_t b) {
21190   vst4_f16(a, b);
21191 }
21192 
21193 // CHECK-LABEL: define void @test_vst4_f32(float* %a, [4 x i64] %b.coerce) #0 {
21194 // CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
21195 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
21196 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
21197 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
21198 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21199 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
21200 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
21201 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21202 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
21203 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
21204 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
21205 // CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
21206 // CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
21207 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
21208 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
21209 // CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
21210 // CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
21211 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
21212 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
21213 // CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
21214 // CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
21215 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
21216 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
21217 // CHECK:   [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
21218 // CHECK:   [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
21219 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
21220 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
21221 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
21222 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
21223 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 4)
21224 // CHECK:   ret void
21225 void test_vst4_f32(float32_t * a, float32x2x4_t b) {
21226   vst4_f32(a, b);
21227 }
21228 
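// Polynomial vectors likewise reuse the integer lowerings (v8i8/v4i16).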
21229 // CHECK-LABEL: define void @test_vst4_p8(i8* %a, [4 x i64] %b.coerce) #0 {
21230 // CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
21231 // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
21232 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
21233 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
21234 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21235 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
21236 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
21237 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21238 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
21239 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
21240 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
21241 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
21242 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
21243 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
21244 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
21245 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
21246 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
21247 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
21248 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
21249 // CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
21250 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
21251 // CHECK:   ret void
21252 void test_vst4_p8(poly8_t * a, poly8x8x4_t b) {
21253   vst4_p8(a, b);
21254 }
21255 
21256 // CHECK-LABEL: define void @test_vst4_p16(i16* %a, [4 x i64] %b.coerce) #0 {
21257 // CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
21258 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
21259 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
21260 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
21261 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21262 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
21263 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
21264 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21265 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
21266 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
21267 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
21268 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
21269 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
21270 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
21271 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
21272 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
21273 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
21274 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
21275 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
21276 // CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
21277 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
21278 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
21279 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
21280 // CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
21281 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
21282 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
21283 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
21284 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
21285 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
21286 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
21287 // CHECK:   ret void
21288 void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
21289   vst4_p16(a, b);
21290 }
21291 
21292 
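// The vst4*_lane tests check the single-lane stores: vst4lane takes the same
// four vectors plus an extra i32 lane index immediately before the alignment
// argument. Each test passes the highest valid lane for its element count
// (7 for eight-element vectors, 3 for four-element ones).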
21293 // CHECK-LABEL: define void @test_vst4q_lane_u16(i16* %a, [8 x i64] %b.coerce) #0 {
21294 // CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
21295 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
21296 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
21297 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
21298 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
21299 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
21300 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
21301 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
21302 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
21303 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
21304 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
21305 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
21306 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
21307 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
21308 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
21309 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
21310 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
21311 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
21312 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
21313 // CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
21314 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
21315 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
21316 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
21317 // CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
21318 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
21319 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
21320 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
21321 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
21322 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
21323 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
21324 // CHECK:   ret void
21325 void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
21326   vst4q_lane_u16(a, b, 7);
21327 }
21328 
21329 // CHECK-LABEL: define void @test_vst4q_lane_u32(i32* %a, [8 x i64] %b.coerce) #0 {
21330 // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
21331 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
21332 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
21333 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
21334 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
21335 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
21336 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
21337 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
21338 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
21339 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
21340 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
21341 // CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
21342 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
21343 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
21344 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
21345 // CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
21346 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
21347 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
21348 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
21349 // CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
21350 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
21351 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
21352 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
21353 // CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
21354 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
21355 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
21356 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
21357 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
21358 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
21359 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
21360 // CHECK:   ret void
21361 void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
21362   vst4q_lane_u32(a, b, 3);
21363 }
21364 
21365 // CHECK-LABEL: define void @test_vst4q_lane_s16(i16* %a, [8 x i64] %b.coerce) #0 {
21366 // CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
21367 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
21368 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
21369 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
21370 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
21371 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
21372 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
21373 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
21374 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
21375 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
21376 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
21377 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
21378 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
21379 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
21380 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
21381 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
21382 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
21383 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
21384 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
21385 // CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
21386 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
21387 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
21388 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
21389 // CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
21390 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
21391 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
21392 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
21393 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
21394 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
21395 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
21396 // CHECK:   ret void
21397 void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
21398   vst4q_lane_s16(a, b, 7);
21399 }
21400 
21401 // CHECK-LABEL: define void @test_vst4q_lane_s32(i32* %a, [8 x i64] %b.coerce) #0 {
21402 // CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
21403 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
21404 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
21405 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
21406 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
21407 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
21408 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
21409 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
21410 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
21411 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
21412 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
21413 // CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
21414 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
21415 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
21416 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
21417 // CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
21418 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
21419 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
21420 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
21421 // CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
21422 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
21423 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
21424 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
21425 // CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
21426 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
21427 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
21428 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
21429 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
21430 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
21431 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
21432 // CHECK:   ret void
21433 void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
21434   vst4q_lane_s32(a, b, 3);
21435 }
21436 
21437 // CHECK-LABEL: define void @test_vst4q_lane_f16(half* %a, [8 x i64] %b.coerce) #0 {
21438 // CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
21439 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
21440 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
21441 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
21442 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
21443 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
21444 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
21445 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
21446 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
21447 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
21448 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
21449 // CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
21450 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
21451 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
21452 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
21453 // CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
21454 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
21455 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
21456 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
21457 // CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
21458 // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
21459 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
21460 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
21461 // CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
21462 // CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
21463 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
21464 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
21465 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
21466 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
21467 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
21468 // CHECK:   ret void
21469 void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
21470   vst4q_lane_f16(a, b, 7);
21471 }
21472 
21473 // CHECK-LABEL: define void @test_vst4q_lane_f32(float* %a, [8 x i64] %b.coerce) #0 {
21474 // CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
21475 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
21476 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
21477 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
21478 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
21479 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
21480 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
21481 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
21482 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
21483 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
21484 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
21485 // CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
21486 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
21487 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
21488 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
21489 // CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
21490 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
21491 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
21492 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
21493 // CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
21494 // CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
21495 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
21496 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
21497 // CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
21498 // CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
21499 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
21500 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
21501 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
21502 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
21503 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 3, i32 4)
21504 // CHECK:   ret void
21505 void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
21506   vst4q_lane_f32(a, b, 3);
21507 }
21508 
21509 // CHECK-LABEL: define void @test_vst4q_lane_p16(i16* %a, [8 x i64] %b.coerce) #0 {
21510 // CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
21511 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
21512 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
21513 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
21514 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
21515 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
21516 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
21517 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
21518 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
21519 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
21520 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
21521 // CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
21522 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
21523 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
21524 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
21525 // CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
21526 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
21527 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
21528 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
21529 // CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
21530 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
21531 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
21532 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
21533 // CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
21534 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
21535 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
21536 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
21537 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
21538 // CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
21539 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
21540 // CHECK:   ret void
21541 void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) {
21542   vst4q_lane_p16(a, b, 7);
21543 }
21544 
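// The remaining vst4_lane_* tests cover the d-register lane stores; the
// pattern matches the q-register cases above, with 64-bit vectors and
// correspondingly smaller maximum lane indices.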
21545 // CHECK-LABEL: define void @test_vst4_lane_u8(i8* %a, [4 x i64] %b.coerce) #0 {
21546 // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
21547 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
21548 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
21549 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
21550 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21551 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
21552 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
21553 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21554 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
21555 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
21556 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
21557 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
21558 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
21559 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
21560 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
21561 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
21562 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
21563 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
21564 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
21565 // CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
21566 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
21567 // CHECK:   ret void
21568 void test_vst4_lane_u8(uint8_t * a, uint8x8x4_t b) {
21569   vst4_lane_u8(a, b, 7);
21570 }
21571 
21572 // CHECK-LABEL: define void @test_vst4_lane_u16(i16* %a, [4 x i64] %b.coerce) #0 {
21573 // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
21574 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
21575 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
21576 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
21577 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21578 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
21579 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
21580 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21581 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
21582 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
21583 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
21584 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
21585 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
21586 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
21587 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
21588 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
21589 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
21590 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
21591 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
21592 // CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
21593 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
21594 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
21595 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
21596 // CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
21597 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
21598 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
21599 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
21600 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
21601 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
21602 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
21603 // CHECK:   ret void
test_vst4_lane_u16(uint16_t * a,uint16x4x4_t b)21604 void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) {
21605   vst4_lane_u16(a, b, 3);
21606 }
21607 
21608 // CHECK-LABEL: define void @test_vst4_lane_u32(i32* %a, [4 x i64] %b.coerce) #0 {
21609 // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
21610 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
21611 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
21612 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
21613 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21614 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
21615 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
21616 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21617 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
21618 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
21619 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
21620 // CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
21621 // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
21622 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
21623 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
21624 // CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
21625 // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
21626 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
21627 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
21628 // CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
21629 // CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
21630 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
21631 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
21632 // CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
21633 // CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
21634 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
21635 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
21636 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
21637 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
21638 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
21639 // CHECK:   ret void
test_vst4_lane_u32(uint32_t * a,uint32x2x4_t b)21640 void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) {
21641   vst4_lane_u32(a, b, 1);
21642 }
21643 
21644 // CHECK-LABEL: define void @test_vst4_lane_s8(i8* %a, [4 x i64] %b.coerce) #0 {
21645 // CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
21646 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
21647 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
21648 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
21649 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21650 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
21651 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
21652 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21653 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
21654 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
21655 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
21656 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
21657 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
21658 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
21659 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
21660 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
21661 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
21662 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
21663 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
21664 // CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
21665 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
21666 // CHECK:   ret void
test_vst4_lane_s8(int8_t * a,int8x8x4_t b)21667 void test_vst4_lane_s8(int8_t * a, int8x8x4_t b) {
21668   vst4_lane_s8(a, b, 7);
21669 }
21670 
21671 // CHECK-LABEL: define void @test_vst4_lane_s16(i16* %a, [4 x i64] %b.coerce) #0 {
21672 // CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
21673 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
21674 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
21675 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
21676 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21677 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
21678 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
21679 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21680 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
21681 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
21682 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
21683 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
21684 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
21685 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
21686 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
21687 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
21688 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
21689 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
21690 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
21691 // CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
21692 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
21693 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
21694 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
21695 // CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
21696 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
21697 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
21698 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
21699 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
21700 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
21701 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
21702 // CHECK:   ret void
test_vst4_lane_s16(int16_t * a,int16x4x4_t b)21703 void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) {
21704   vst4_lane_s16(a, b, 3);
21705 }
21706 
21707 // CHECK-LABEL: define void @test_vst4_lane_s32(i32* %a, [4 x i64] %b.coerce) #0 {
21708 // CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
21709 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
21710 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
21711 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
21712 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21713 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
21714 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
21715 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21716 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
21717 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
21718 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
21719 // CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
21720 // CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
21721 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
21722 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
21723 // CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
21724 // CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
21725 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
21726 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
21727 // CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
21728 // CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
21729 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
21730 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
21731 // CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
21732 // CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
21733 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
21734 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
21735 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
21736 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
21737 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
21738 // CHECK:   ret void
test_vst4_lane_s32(int32_t * a,int32x2x4_t b)21739 void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) {
21740   vst4_lane_s32(a, b, 1);
21741 }
21742 
21743 // CHECK-LABEL: define void @test_vst4_lane_f16(half* %a, [4 x i64] %b.coerce) #0 {
21744 // CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
21745 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
21746 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
21747 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
21748 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21749 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
21750 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
21751 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21752 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
21753 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
21754 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
21755 // CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
21756 // CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
21757 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
21758 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
21759 // CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
21760 // CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
21761 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
21762 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
21763 // CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
21764 // CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
21765 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
21766 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
21767 // CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
21768 // CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
21769 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
21770 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
21771 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
21772 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
21773 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
21774 // CHECK:   ret void
test_vst4_lane_f16(float16_t * a,float16x4x4_t b)21775 void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) {
21776   vst4_lane_f16(a, b, 3);
21777 }
21778 
21779 // CHECK-LABEL: define void @test_vst4_lane_f32(float* %a, [4 x i64] %b.coerce) #0 {
21780 // CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
21781 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
21782 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
21783 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
21784 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21785 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
21786 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
21787 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21788 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
21789 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
21790 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
21791 // CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
21792 // CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
21793 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
21794 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
21795 // CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
21796 // CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
21797 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
21798 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
21799 // CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
21800 // CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
21801 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
21802 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
21803 // CHECK:   [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
21804 // CHECK:   [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
21805 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
21806 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
21807 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
21808 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
21809 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 1, i32 4)
21810 // CHECK:   ret void
test_vst4_lane_f32(float32_t * a,float32x2x4_t b)21811 void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) {
21812   vst4_lane_f32(a, b, 1);
21813 }
21814 
21815 // CHECK-LABEL: define void @test_vst4_lane_p8(i8* %a, [4 x i64] %b.coerce) #0 {
21816 // CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
21817 // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
21818 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
21819 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
21820 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21821 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
21822 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
21823 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21824 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
21825 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
21826 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
21827 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
21828 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
21829 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
21830 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
21831 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
21832 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
21833 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
21834 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
21835 // CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
21836 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
21837 // CHECK:   ret void
test_vst4_lane_p8(poly8_t * a,poly8x8x4_t b)21838 void test_vst4_lane_p8(poly8_t * a, poly8x8x4_t b) {
21839   vst4_lane_p8(a, b, 7);
21840 }
21841 
21842 // CHECK-LABEL: define void @test_vst4_lane_p16(i16* %a, [4 x i64] %b.coerce) #0 {
21843 // CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
21844 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
21845 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
21846 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
21847 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21848 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
21849 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
21850 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21851 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
21852 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
21853 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
21854 // CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
21855 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
21856 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
21857 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
21858 // CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
21859 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
21860 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
21861 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
21862 // CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
21863 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
21864 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
21865 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
21866 // CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
21867 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
21868 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
21869 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
21870 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
21871 // CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
21872 // CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
21873 // CHECK:   ret void
test_vst4_lane_p16(poly16_t * a,poly16x4x4_t b)21874 void test_vst4_lane_p16(poly16_t * a, poly16x4x4_t b) {
21875   vst4_lane_p16(a, b, 3);
21876 }
21877 
21878 
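// Plain vector subtraction: vsub/vsubq should lower to a bare IR sub
// (fsub for the float variants) with no intrinsic call.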
// CHECK-LABEL: define <8 x i8> @test_vsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[SUB_I]]
int8x8_t test_vsub_s8(int8x8_t a, int8x8_t b) {
  return vsub_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[SUB_I]]
int16x4_t test_vsub_s16(int16x4_t a, int16x4_t b) {
  return vsub_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[SUB_I]]
int32x2_t test_vsub_s32(int32x2_t a, int32x2_t b) {
  return vsub_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vsub_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[SUB_I]]
int64x1_t test_vsub_s64(int64x1_t a, int64x1_t b) {
  return vsub_s64(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vsub_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, %b
// CHECK:   ret <2 x float> [[SUB_I]]
float32x2_t test_vsub_f32(float32x2_t a, float32x2_t b) {
  return vsub_f32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[SUB_I]]
uint8x8_t test_vsub_u8(uint8x8_t a, uint8x8_t b) {
  return vsub_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[SUB_I]]
uint16x4_t test_vsub_u16(uint16x4_t a, uint16x4_t b) {
  return vsub_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[SUB_I]]
uint32x2_t test_vsub_u32(uint32x2_t a, uint32x2_t b) {
  return vsub_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vsub_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[SUB_I]]
uint64x1_t test_vsub_u64(uint64x1_t a, uint64x1_t b) {
  return vsub_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[SUB_I]]
int8x16_t test_vsubq_s8(int8x16_t a, int8x16_t b) {
  return vsubq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vsubq_s16(int16x8_t a, int16x8_t b) {
  return vsubq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vsubq_s32(int32x4_t a, int32x4_t b) {
  return vsubq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[SUB_I]]
int64x2_t test_vsubq_s64(int64x2_t a, int64x2_t b) {
  return vsubq_s64(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vsubq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, %b
// CHECK:   ret <4 x float> [[SUB_I]]
float32x4_t test_vsubq_f32(float32x4_t a, float32x4_t b) {
  return vsubq_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[SUB_I]]
uint8x16_t test_vsubq_u8(uint8x16_t a, uint8x16_t b) {
  return vsubq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[SUB_I]]
uint16x8_t test_vsubq_u16(uint16x8_t a, uint16x8_t b) {
  return vsubq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vsubq_u32(uint32x4_t a, uint32x4_t b) {
  return vsubq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[SUB_I]]
uint64x2_t test_vsubq_u64(uint64x2_t a, uint64x2_t b) {
  return vsubq_u64(a, b);
}


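// vsubhn is subtract-and-narrow: subtract, logical-shift-right by half the
// element width, then truncate to the half-width vector.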
// CHECK-LABEL: define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VSUBHN2_I]]
int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
  return vsubhn_s16(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VSUBHN2_I]]
int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
  return vsubhn_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VSUBHN2_I]]
int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
  return vsubhn_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VSUBHN2_I]]
uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
  return vsubhn_u16(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VSUBHN2_I]]
uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
  return vsubhn_u32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VSUBHN2_I]]
uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
  return vsubhn_u64(a, b);
}


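// vsubl widens both narrow operands to the double-width element type
// (sext for signed, zext for unsigned) before the subtraction.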
// CHECK-LABEL: define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
  return vsubl_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
  return vsubl_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
  return vsubl_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
  return vsubl_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
  return vsubl_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
  return vsubl_u32(a, b);
}


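// vsubw widens only the second, narrow operand and subtracts it from the
// already-wide first operand.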
// CHECK-LABEL: define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
  return vsubw_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
  return vsubw_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) {
  return vsubw_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
  return vsubw_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
  return vsubw_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) {
  return vsubw_u32(a, b);
}


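// vtbl1 is the one-register byte table lookup; every element type lowers
// to the same @llvm.arm.neon.vtbl1 call.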
// CHECK-LABEL: define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL1_I]]
uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
  return vtbl1_u8(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL1_I]]
int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
  return vtbl1_s8(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL1_I]]
poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
  return vtbl1_p8(a, b);
}


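// For vtbl2 the two-register table is passed as a [2 x i64] coerced
// aggregate, spilled to an alloca, and each d-register is reloaded before
// the @llvm.arm.neon.vtbl2 call.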
// CHECK-LABEL: define <8 x i8> @test_vtbl2_u8([2 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL2_I]]
uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
  return vtbl2_u8(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtbl2_s8([2 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL2_I]]
int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
  return vtbl2_s8(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtbl2_p8([2 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL2_I]]
poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
  return vtbl2_p8(a, b);
}


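// vtbl3 repeats the pattern with a [3 x i64] coerced three-register table
// and @llvm.arm.neon.vtbl3.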
22308 // CHECK-LABEL: define <8 x i8> @test_vtbl3_u8([3 x i64] %a.coerce, <8 x i8> %b) #0 {
22309 // CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
22310 // CHECK:   [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
22311 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
22312 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
22313 // CHECK:   store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
22314 // CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
22315 // CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
22316 // CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
22317 // CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
22318 // CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
22319 // CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
22320 // CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
22321 // CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22322 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22323 // CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
22324 // CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22325 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22326 // CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
22327 // CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
22328 // CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
22329 // CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b) #4
22330 // CHECK:   ret <8 x i8> [[VTBL3_I]]
uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
  return vtbl3_u8(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtbl3_s8([3 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL3_I]]
int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
  return vtbl3_s8(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtbl3_p8([3 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL3_I]]
poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
  return vtbl3_p8(a, b);
}

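// vtbl3/vtbl4 concatenate the 8-byte registers of the first operand into a
// single 24- or 32-entry byte table; each result lane is table[index] for the
// corresponding lane of the index vector, and any out-of-range index yields
// 0.  A minimal usage sketch -- the values are hypothetical and not part of
// the checked IR:
//
//   uint8x8x3_t tbl = { .val = { vdup_n_u8(1), vdup_n_u8(2), vdup_n_u8(3) } };
//   uint8x8_t   idx = vcreate_u8(0x00000000FF171008ULL);
//   uint8x8_t   r   = vtbl3_u8(tbl, idx);
//   // lane 0 reads tbl.val[1][0] (index 8)   -> 2
//   // lane 1 reads tbl.val[2][0] (index 16)  -> 3
//   // lane 2 reads tbl.val[2][7] (index 23)  -> 3
//   // lane 3 is out of range     (index 255) -> 0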

// CHECK-LABEL: define <8 x i8> @test_vtbl4_u8([4 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL4_I]]
uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
  return vtbl4_u8(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtbl4_s8([4 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL4_I]]
int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
  return vtbl4_s8(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtbl4_p8([4 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL4_I]]
poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
  return vtbl4_p8(a, b);
}


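// vtbx is the "extension" variant of vtbl: lanes whose index is out of range
// keep the value of the first (destination) operand instead of being zeroed.
// A minimal sketch with hypothetical values, not part of the checked IR:
//
//   uint8x8_t fallback = vdup_n_u8(0xAA);
//   uint8x8_t tbl      = vcreate_u8(0x0706050403020100ULL);  // identity table
//   uint8x8_t idx      = vcreate_u8(0xFFFFFFFF03020100ULL);
//   uint8x8_t r        = vtbx1_u8(fallback, tbl, idx);
//   // lanes 0-3 come from the table (0,1,2,3); lanes 4-7 keep 0xAA because
//   // index 0xFF is past the 8-entry table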
// CHECK-LABEL: define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX1_I]]
uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vtbx1_u8(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX1_I]]
int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
  return vtbx1_s8(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX1_I]]
poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
  return vtbx1_p8(a, b, c);
}


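// In the multi-register tests below the table argument (e.g. uint8x8x2_t) is
// passed as an [N x i64] array ("%b.coerce") under this target's ABI; the
// frontend spills it to an alloca and reloads each <8 x i8> element before
// calling the intrinsic, which is presumably why the allocas survive the
// -mem2reg run in the RUN line: they are accessed at both the array and the
// element type.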
// CHECK-LABEL: define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX2_I]]
uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
  return vtbx2_u8(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX2_I]]
int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
  return vtbx2_s8(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX2_I]]
poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
  return vtbx2_p8(a, b, c);
}


// CHECK-LABEL: define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX3_I]]
uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
  return vtbx3_u8(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX3_I]]
int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
  return vtbx3_s8(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX3_I]]
poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
  return vtbx3_p8(a, b, c);
}


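// vtbx4 behaves like vtbx2/vtbx3 with a 32-entry table: indices 0-31 select a
// byte from the four table registers, and any larger index leaves the
// corresponding lane of the first operand untouched.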
// CHECK-LABEL: define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX4_I]]
uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
  return vtbx4_u8(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX4_I]]
int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
  return vtbx4_s8(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x i64] %b.coerce, <8 x i8> %c) #0 {
// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VTBX4_I]]
poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
  return vtbx4_p8(a, b, c);
}


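// vtrn transposes pairs of elements across its two inputs: val[0] interleaves
// the even-numbered lanes of the operands, val[1] the odd-numbered lanes
// (exactly the two shufflevector masks checked below).  Because the result is
// a two-vector struct, it is returned through the sret pointer with a final
// memcpy.  A minimal sketch with hypothetical values, not part of the checked
// IR:
//
//   int8x8_t va = vcreate_s8(0x0706050403020100ULL);  // {0,1,2,3,4,5,6,7}
//   int8x8_t vb = vcreate_s8(0x1716151413121110ULL);  // {16,17,...,23}
//   int8x8x2_t t = vtrn_s8(va, vb);
//   // t.val[0] = {0,16,2,18,4,20,6,22}
//   // t.val[1] = {1,17,3,19,5,21,7,23}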
// CHECK-LABEL: define void @test_vtrn_s8(%struct.int8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
  return vtrn_s8(a, b);
}

// CHECK-LABEL: define void @test_vtrn_s16(%struct.int16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
  return vtrn_s16(a, b);
}

// CHECK-LABEL: define void @test_vtrn_s32(%struct.int32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
  return vtrn_s32(a, b);
}

// CHECK-LABEL: define void @test_vtrn_u8(%struct.uint8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
  return vtrn_u8(a, b);
}

// CHECK-LABEL: define void @test_vtrn_u16(%struct.uint16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
  return vtrn_u16(a, b);
}

// CHECK-LABEL: define void @test_vtrn_u32(%struct.uint32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
  return vtrn_u32(a, b);
}

// CHECK-LABEL: define void @test_vtrn_f32(%struct.float32x2x2_t* noalias sret %agg.result, <2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
// CHECK:   store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
// CHECK:   store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
  return vtrn_f32(a, b);
}

// CHECK-LABEL: define void @test_vtrn_p8(%struct.poly8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
  return vtrn_p8(a, b);
}

// CHECK-LABEL: define void @test_vtrn_p16(%struct.poly16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
  return vtrn_p16(a, b);
}

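// The q-suffixed variants below perform the same transpose on 128-bit Q
// registers: the result structs are 16-byte aligned and the final memcpy
// copies 32 bytes instead of 16, but the even/odd shuffle pattern is
// unchanged.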
22926 // CHECK-LABEL: define void @test_vtrnq_s8(%struct.int8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
22927 // CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
22928 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
22929 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
22930 // CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
22931 // CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
22932 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
22933 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
22934 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
22935 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
22936 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
22937 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
22938 // CHECK:   ret void
test_vtrnq_s8(int8x16_t a,int8x16_t b)22939 int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
22940   return vtrnq_s8(a, b);
22941 }
22942 
22943 // CHECK-LABEL: define void @test_vtrnq_s16(%struct.int16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
22944 // CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
22945 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
22946 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
22947 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
22948 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
22949 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
22950 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
22951 // CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
22952 // CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
22953 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
22954 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
22955 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
22956 // CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
22957 // CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
22958 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
22959 // CHECK:   ret void
test_vtrnq_s16(int16x8_t a,int16x8_t b)22960 int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
22961   return vtrnq_s16(a, b);
22962 }
22963 
22964 // CHECK-LABEL: define void @test_vtrnq_s32(%struct.int32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
22965 // CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
22966 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
22967 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
22968 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
22969 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
22970 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
22971 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
22972 // CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
22973 // CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
22974 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
22975 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
22976 // CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
22977 // CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
22978 // CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
22979 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
22980 // CHECK:   ret void
test_vtrnq_s32(int32x4_t a,int32x4_t b)22981 int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
22982   return vtrnq_s32(a, b);
22983 }
22984 
22985 // CHECK-LABEL: define void @test_vtrnq_u8(%struct.uint8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
22986 // CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
22987 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
22988 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
22989 // CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
22990 // CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
22991 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
22992 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
22993 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
22994 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
22995 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
22996 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
22997 // CHECK:   ret void
test_vtrnq_u8(uint8x16_t a,uint8x16_t b)22998 uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
22999   return vtrnq_u8(a, b);
23000 }
23001 
23002 // CHECK-LABEL: define void @test_vtrnq_u16(%struct.uint16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
23003 // CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
23004 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
23005 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
23006 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
23007 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
23008 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
23009 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
23010 // CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
23011 // CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
23012 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
23013 // CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
23014 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
23015 // CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
23016 // CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
23017 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23018 // CHECK:   ret void
test_vtrnq_u16(uint16x8_t a,uint16x8_t b)23019 uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
23020   return vtrnq_u16(a, b);
23021 }
23022 
// CHECK-LABEL: define void @test_vtrnq_u32(%struct.uint32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
  return vtrnq_u32(a, b);
}

// CHECK-LABEL: define void @test_vtrnq_f32(%struct.float32x4x2_t* noalias sret %agg.result, <4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK:   store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK:   store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
  return vtrnq_f32(a, b);
}

// CHECK-LABEL: define void @test_vtrnq_p8(%struct.poly8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
  return vtrnq_p8(a, b);
}

// CHECK-LABEL: define void @test_vtrnq_p16(%struct.poly16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
  return vtrnq_p16(a, b);
}

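// Illustrative note (no directives here): as the shuffle masks above show,
// vtrn transposes even/odd lane pairs of its operands. Assuming, e.g.,
// a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3}, a 4-lane vtrn would produce
// val[0] = {a0,b0,a2,b2} (mask <0,4,2,6>) and val[1] = {a1,b1,a3,b3}
// (mask <1,5,3,7>).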

// CHECK-LABEL: define <8 x i8> @test_vtst_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = and <8 x i8> %a, %b
// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VTST_I]]
uint8x8_t test_vtst_s8(int8x8_t a, int8x8_t b) {
  return vtst_s8(a, b);
}

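// A minimal sketch of the semantics the IR above encodes: vtst computes
// (a & b) != 0 per lane and sign-extends the i1 result, so an output lane is
// all-ones wherever the inputs share at least one set bit and all-zeros
// otherwise. For example, assuming the first two lanes are a = {0x01, 0x02}
// and b = {0x01, 0x04}, the result would start {0xff, 0x00}.
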
// CHECK-LABEL: define <4 x i16> @test_vtst_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_s16(int16x4_t a, int16x4_t b) {
  return vtst_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vtst_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VTST_I]]
uint32x2_t test_vtst_s32(int32x2_t a, int32x2_t b) {
  return vtst_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtst_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = and <8 x i8> %a, %b
// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VTST_I]]
uint8x8_t test_vtst_u8(uint8x8_t a, uint8x8_t b) {
  return vtst_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vtst_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_u16(uint16x4_t a, uint16x4_t b) {
  return vtst_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vtst_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VTST_I]]
uint32x2_t test_vtst_u32(uint32x2_t a, uint32x2_t b) {
  return vtst_u32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vtst_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = and <8 x i8> %a, %b
// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VTST_I]]
uint8x8_t test_vtst_p8(poly8x8_t a, poly8x8_t b) {
  return vtst_p8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vtst_p16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VTST_I]]
uint16x4_t test_vtst_p16(poly16x4_t a, poly16x4_t b) {
  return vtst_p16(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vtstq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = and <16 x i8> %a, %b
// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
// CHECK:   ret <16 x i8> [[VTST_I]]
uint8x16_t test_vtstq_s8(int8x16_t a, int8x16_t b) {
  return vtstq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vtstq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
// CHECK:   ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_s16(int16x8_t a, int16x8_t b) {
  return vtstq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vtstq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VTST_I]]
uint32x4_t test_vtstq_s32(int32x4_t a, int32x4_t b) {
  return vtstq_s32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vtstq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = and <16 x i8> %a, %b
// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
// CHECK:   ret <16 x i8> [[VTST_I]]
uint8x16_t test_vtstq_u8(uint8x16_t a, uint8x16_t b) {
  return vtstq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vtstq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
// CHECK:   ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_u16(uint16x8_t a, uint16x8_t b) {
  return vtstq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vtstq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VTST_I]]
uint32x4_t test_vtstq_u32(uint32x4_t a, uint32x4_t b) {
  return vtstq_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vtstq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = and <16 x i8> %a, %b
// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
// CHECK:   ret <16 x i8> [[VTST_I]]
uint8x16_t test_vtstq_p8(poly8x16_t a, poly8x16_t b) {
  return vtstq_p8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vtstq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
// CHECK:   ret <8 x i16> [[VTST_I]]
uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
  return vtstq_p16(a, b);
}

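// Note: every vtst variant above returns an unsigned mask type regardless of
// whether its inputs are signed, unsigned, or polynomial, and the generated
// and/icmp/sext sequence is the same for all of them modulo lane width.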

// CHECK-LABEL: define void @test_vuzp_s8(%struct.int8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
  return vuzp_s8(a, b);
}

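// Illustrative note (no directives here): vuzp de-interleaves the
// concatenation of its operands, as the even/odd masks above show. Assuming
// a = {a0,...,a7} and b = {b0,...,b7}, vuzp_s8(a, b) would produce
// val[0] = {a0,a2,a4,a6,b0,b2,b4,b6} and val[1] = {a1,a3,a5,a7,b1,b3,b5,b7}.
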
// CHECK-LABEL: define void @test_vuzp_s16(%struct.int16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
  return vuzp_s16(a, b);
}

// CHECK-LABEL: define void @test_vuzp_s32(%struct.int32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
  return vuzp_s32(a, b);
}

// CHECK-LABEL: define void @test_vuzp_u8(%struct.uint8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
  return vuzp_u8(a, b);
}

// CHECK-LABEL: define void @test_vuzp_u16(%struct.uint16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
  return vuzp_u16(a, b);
}

// CHECK-LABEL: define void @test_vuzp_u32(%struct.uint32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
  return vuzp_u32(a, b);
}

// CHECK-LABEL: define void @test_vuzp_f32(%struct.float32x2x2_t* noalias sret %agg.result, <2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
// CHECK:   store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
// CHECK:   store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
  return vuzp_f32(a, b);
}

// CHECK-LABEL: define void @test_vuzp_p8(%struct.poly8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
  return vuzp_p8(a, b);
}

// CHECK-LABEL: define void @test_vuzp_p16(%struct.poly16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
  return vuzp_p16(a, b);
}

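// The q-suffixed tests below apply the same even/odd masks to 128-bit
// vectors; the result struct accordingly doubles to 32 bytes with 16-byte
// alignment, visible in the memcpy size and the alloca alignment.
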
// CHECK-LABEL: define void @test_vuzpq_s8(%struct.int8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
  return vuzpq_s8(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_s16(%struct.int16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
  return vuzpq_s16(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_s32(%struct.int32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
  return vuzpq_s32(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_u8(%struct.uint8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
  return vuzpq_u8(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_u16(%struct.uint16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
  return vuzpq_u16(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_u32(%struct.uint32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
  return vuzpq_u32(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_f32(%struct.float32x4x2_t* noalias sret %agg.result, <4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK:   store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK:   store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
  return vuzpq_f32(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_p8(%struct.poly8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
  return vuzpq_p8(a, b);
}

// CHECK-LABEL: define void @test_vuzpq_p16(%struct.poly16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
  return vuzpq_p16(a, b);
}


// CHECK-LABEL: define void @test_vzip_s8(%struct.int8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
// CHECK:   ret void
int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
  return vzip_s8(a, b);
}

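// Illustrative note (no directives here): vzip interleaves the low halves of
// its operands into val[0] and the high halves into val[1]. Assuming
// a = {a0,...,a7} and b = {b0,...,b7}, vzip_s8(a, b) would produce
// val[0] = {a0,b0,a1,b1,a2,b2,a3,b3} and val[1] = {a4,b4,a5,b5,a6,b6,a7,b7},
// matching the <0,8,1,9,...> and <4,12,5,13,...> masks above.
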
23661 // CHECK-LABEL: define void @test_vzip_s16(%struct.int16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
23662 // CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
23663 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
23664 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
23665 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
23666 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
23667 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
23668 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
23669 // CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
23670 // CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
23671 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
23672 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
23673 // CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
23674 // CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
23675 // CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
23676 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23677 // CHECK:   ret void
test_vzip_s16(int16x4_t a,int16x4_t b)23678 int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
23679   return vzip_s16(a, b);
23680 }
23681 
23682 // CHECK-LABEL: define void @test_vzip_s32(%struct.int32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
23683 // CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
23684 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
23685 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
23686 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
23687 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
23688 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
23689 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
23690 // CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
23691 // CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
23692 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
23693 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
23694 // CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]]
23695 // CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
23696 // CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
23697 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23698 // CHECK:   ret void
test_vzip_s32(int32x2_t a,int32x2_t b)23699 int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
23700   return vzip_s32(a, b);
23701 }
23702 
23703 // CHECK-LABEL: define void @test_vzip_u8(%struct.uint8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
23704 // CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
23705 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
23706 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
23707 // CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
23708 // CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
23709 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
23710 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
23711 // CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
23712 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
23713 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
23714 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
23715 // CHECK:   ret void
test_vzip_u8(uint8x8_t a,uint8x8_t b)23716 uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
23717   return vzip_u8(a, b);
23718 }
23719 
23720 // CHECK-LABEL: define void @test_vzip_u16(%struct.uint16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
23721 // CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
23722 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
23723 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
23724 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
23725 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
23726 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
23727 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
23728 // CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
23729 // CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
23730 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
23731 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
23732 // CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
23733 // CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
23734 // CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
23735 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23736 // CHECK:   ret void
test_vzip_u16(uint16x4_t a,uint16x4_t b)23737 uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
23738   return vzip_u16(a, b);
23739 }
23740 
23741 // CHECK-LABEL: define void @test_vzip_u32(%struct.uint32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
23742 // CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
23743 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
23744 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
23745 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
23746 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
23747 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
23748 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
23749 // CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
23750 // CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
23751 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
23752 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
23753 // CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]]
23754 // CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
23755 // CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
23756 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23757 // CHECK:   ret void
test_vzip_u32(uint32x2_t a,uint32x2_t b)23758 uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
23759   return vzip_u32(a, b);
23760 }
23761 
23762 // CHECK-LABEL: define void @test_vzip_f32(%struct.float32x2x2_t* noalias sret %agg.result, <2 x float> %a, <2 x float> %b) #0 {
23763 // CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
23764 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
23765 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
23766 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
23767 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
23768 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
23769 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
23770 // CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
23771 // CHECK:   store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]]
23772 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
23773 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
23774 // CHECK:   store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP6]]
23775 // CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
23776 // CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
23777 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23778 // CHECK:   ret void
test_vzip_f32(float32x2_t a,float32x2_t b)23779 float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
23780   return vzip_f32(a, b);
23781 }
23782 
23783 // CHECK-LABEL: define void @test_vzip_p8(%struct.poly8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
23784 // CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
23785 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
23786 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
23787 // CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
23788 // CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
23789 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
23790 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
23791 // CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
23792 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
23793 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
23794 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
23795 // CHECK:   ret void
test_vzip_p8(poly8x8_t a,poly8x8_t b)23796 poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
23797   return vzip_p8(a, b);
23798 }
23799 
23800 // CHECK-LABEL: define void @test_vzip_p16(%struct.poly16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
23801 // CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
23802 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
23803 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
23804 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
23805 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
23806 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
23807 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
23808 // CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
23809 // CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
23810 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
23811 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
23812 // CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
23813 // CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
23814 // CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
23815 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23816 // CHECK:   ret void
test_vzip_p16(poly16x4_t a,poly16x4_t b)23817 poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
23818   return vzip_p16(a, b);
23819 }
23820 
23821 // CHECK-LABEL: define void @test_vzipq_s8(%struct.int8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
23822 // CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
23823 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
23824 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
23825 // CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
23826 // CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
23827 // CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
23828 // CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
23829 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
23830 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
23831 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
23832 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
23833 // CHECK:   ret void
test_vzipq_s8(int8x16_t a,int8x16_t b)23834 int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
23835   return vzipq_s8(a, b);
23836 }
23837 
// CHECK-LABEL: define void @test_vzipq_s16(%struct.int16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
  return vzipq_s16(a, b);
}

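// For lane types wider than i8 (the 16-bit, 32-bit, and float variants), the
// operands are first bitcast to <16 x i8> and back before the shuffle, so
// the CHECK lines match the intermediate TMP values rather than %a and %b
// directly; the i8 variants shuffle the arguments as-is.
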
// CHECK-LABEL: define void @test_vzipq_s32(%struct.int32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
  return vzipq_s32(a, b);
}

// CHECK-LABEL: define void @test_vzipq_u8(%struct.uint8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
  return vzipq_u8(a, b);
}

// CHECK-LABEL: define void @test_vzipq_u16(%struct.uint16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
  return vzipq_u16(a, b);
}

// CHECK-LABEL: define void @test_vzipq_u32(%struct.uint32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
  return vzipq_u32(a, b);
}

// CHECK-LABEL: define void @test_vzipq_f32(%struct.float32x4x2_t* noalias sret %agg.result, <4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK:   store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
// CHECK:   store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
  return vzipq_f32(a, b);
}

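// vzipq_f32 above uses the same <4 x i32> shuffle masks as the 32-bit
// integer variants; only the lane type in the surrounding bitcasts differs.
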
// CHECK-LABEL: define void @test_vzipq_p8(%struct.poly8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
  return vzipq_p8(a, b);
}

// CHECK-LABEL: define void @test_vzipq_p16(%struct.poly16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
// CHECK:   ret void
poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
  return vzipq_p16(a, b);
}
