1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the interfaces that X86 uses to lower LLVM code into a
11 // selection DAG.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "X86ISelLowering.h"
16 #include "Utils/X86ShuffleDecode.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86MachineFunctionInfo.h"
21 #include "X86ShuffleDecodeConstantPool.h"
22 #include "X86TargetMachine.h"
23 #include "X86TargetObjectFile.h"
24 #include "llvm/ADT/SmallBitVector.h"
25 #include "llvm/ADT/SmallSet.h"
26 #include "llvm/ADT/Statistic.h"
27 #include "llvm/ADT/StringExtras.h"
28 #include "llvm/ADT/StringSwitch.h"
29 #include "llvm/Analysis/EHPersonalities.h"
30 #include "llvm/CodeGen/IntrinsicLowering.h"
31 #include "llvm/CodeGen/MachineFrameInfo.h"
32 #include "llvm/CodeGen/MachineFunction.h"
33 #include "llvm/CodeGen/MachineInstrBuilder.h"
34 #include "llvm/CodeGen/MachineJumpTableInfo.h"
35 #include "llvm/CodeGen/MachineModuleInfo.h"
36 #include "llvm/CodeGen/MachineRegisterInfo.h"
37 #include "llvm/CodeGen/WinEHFuncInfo.h"
38 #include "llvm/IR/CallSite.h"
39 #include "llvm/IR/CallingConv.h"
40 #include "llvm/IR/Constants.h"
41 #include "llvm/IR/DerivedTypes.h"
42 #include "llvm/IR/Function.h"
43 #include "llvm/IR/GlobalAlias.h"
44 #include "llvm/IR/GlobalVariable.h"
45 #include "llvm/IR/Instructions.h"
46 #include "llvm/IR/Intrinsics.h"
47 #include "llvm/MC/MCAsmInfo.h"
48 #include "llvm/MC/MCContext.h"
49 #include "llvm/MC/MCExpr.h"
50 #include "llvm/MC/MCSymbol.h"
51 #include "llvm/Support/CommandLine.h"
52 #include "llvm/Support/Debug.h"
53 #include "llvm/Support/ErrorHandling.h"
54 #include "llvm/Support/MathExtras.h"
55 #include "llvm/Target/TargetOptions.h"
56 #include "X86IntrinsicsInfo.h"
57 #include <bitset>
58 #include <numeric>
59 #include <cctype>
60 using namespace llvm;
61
62 #define DEBUG_TYPE "x86-isel"
63
64 STATISTIC(NumTailCalls, "Number of tail calls");
65
66 static cl::opt<bool> ExperimentalVectorWideningLegalization(
67 "x86-experimental-vector-widening-legalization", cl::init(false),
68 cl::desc("Enable an experimental vector type legalization through widening "
69 "rather than promotion."),
70 cl::Hidden);
71
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
75 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
76 X86ScalarSSEf64 = Subtarget.hasSSE2();
77 X86ScalarSSEf32 = Subtarget.hasSSE1();
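  // Cache the pointer-sized integer type (i32 on 32-bit targets, i64 on
  // 64-bit targets); it is used below wherever a pointer-sized operation
  // needs to be configured (e.g. DYNAMIC_STACKALLOC).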
78 MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
79
80 // Set up the TargetLowering object.
81
82 // X86 is weird. It always uses i8 for shift amounts and setcc results.
83 setBooleanContents(ZeroOrOneBooleanContent);
84 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
85 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
86
87 // For 64-bit, since we have so many registers, use the ILP scheduler.
88 // For 32-bit, use the register pressure specific scheduling.
89 // For Atom, always use ILP scheduling.
90 if (Subtarget.isAtom())
91 setSchedulingPreference(Sched::ILP);
92 else if (Subtarget.is64Bit())
93 setSchedulingPreference(Sched::ILP);
94 else
95 setSchedulingPreference(Sched::RegPressure);
96 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
97 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
98
  // Bypass expensive divides on subtargets with slow divide instructions
  // (e.g. Atom) when compiling at -O2 or higher.
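  // addBypassSlowDiv(N, M) tells the BypassSlowDivision pass to guard N-bit
  // divides with a runtime check and use an M-bit divide when both operands
  // fit within M bits.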
100 if (TM.getOptLevel() >= CodeGenOpt::Default) {
101 if (Subtarget.hasSlowDivide32())
102 addBypassSlowDiv(32, 8);
103 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
104 addBypassSlowDiv(64, 16);
105 }
106
107 if (Subtarget.isTargetKnownWindowsMSVC()) {
108 // Setup Windows compiler runtime calls.
109 setLibcallName(RTLIB::SDIV_I64, "_alldiv");
110 setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
111 setLibcallName(RTLIB::SREM_I64, "_allrem");
112 setLibcallName(RTLIB::UREM_I64, "_aullrem");
113 setLibcallName(RTLIB::MUL_I64, "_allmul");
114 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
115 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
116 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
117 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
118 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
119 }
120
121 if (Subtarget.isTargetDarwin()) {
122 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
123 setUseUnderscoreSetJmp(false);
124 setUseUnderscoreLongJmp(false);
125 } else if (Subtarget.isTargetWindowsGNU()) {
    // The MS runtime is inconsistent: it exports _setjmp, but only plain
    // longjmp (no underscore prefix).
127 setUseUnderscoreSetJmp(true);
128 setUseUnderscoreLongJmp(false);
129 } else {
130 setUseUnderscoreSetJmp(true);
131 setUseUnderscoreLongJmp(true);
132 }
133
134 // Set up the register classes.
135 addRegisterClass(MVT::i8, &X86::GR8RegClass);
136 addRegisterClass(MVT::i16, &X86::GR16RegClass);
137 addRegisterClass(MVT::i32, &X86::GR32RegClass);
138 if (Subtarget.is64Bit())
139 addRegisterClass(MVT::i64, &X86::GR64RegClass);
140
141 for (MVT VT : MVT::integer_valuetypes())
142 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
143
144 // We don't accept any truncstore of integer registers.
145 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
146 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
147 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
148 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
149 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
150 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
151
152 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
153
154 // SETOEQ and SETUNE require checking two conditions.
155 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
156 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
157 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
158 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
159 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
160 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
161
162 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
163 // operation.
164 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
165 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
166 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
167
168 if (Subtarget.is64Bit()) {
169 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
170 // f32/f64 are legal, f80 is custom.
171 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
172 else
173 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
174 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
175 } else if (!Subtarget.useSoftFloat()) {
176 // We have an algorithm for SSE2->double, and we turn this into a
177 // 64-bit FILD followed by conditional FADD for other targets.
178 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
179 // We have an algorithm for SSE2, and we turn this into a 64-bit
180 // FILD or VCVTUSI2SS/SD for other targets.
181 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
182 }
183
184 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
185 // this operation.
186 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
187 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
188
189 if (!Subtarget.useSoftFloat()) {
190 // SSE has no i16 to fp conversion, only i32
191 if (X86ScalarSSEf32) {
192 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
193 // f32 and f64 cases are Legal, f80 case is not
194 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
195 } else {
196 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
197 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
198 }
199 } else {
200 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
201 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
202 }
203
  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
206 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
207 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
208
209 if (!Subtarget.useSoftFloat()) {
210 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
211 // are Legal, f80 is custom lowered.
212 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
213 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
214
215 if (X86ScalarSSEf32) {
216 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
217 // f32 and f64 cases are Legal, f80 case is not
218 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
219 } else {
220 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
221 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
222 }
223 } else {
224 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
225 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
226 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
227 }
228
229 // Handle FP_TO_UINT by promoting the destination to a larger signed
230 // conversion.
231 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
232 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
233 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
234
235 if (Subtarget.is64Bit()) {
236 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
237 // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
238 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
239 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
240 } else {
241 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
242 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
243 }
244 } else if (!Subtarget.useSoftFloat()) {
245 // Since AVX is a superset of SSE3, only check for SSE here.
246 if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
247 // Expand FP_TO_UINT into a select.
248 // FIXME: We would like to use a Custom expander here eventually to do
249 // the optimal thing for SSE vs. the default expansion in the legalizer.
250 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
251 else
252 // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
253 // With SSE3 we can use fisttpll to convert to a signed i64; without
254 // SSE, we're stuck with a fistpll.
255 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
256
257 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
258 }
259
260 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
261 if (!X86ScalarSSEf64) {
262 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
263 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
264 if (Subtarget.is64Bit()) {
265 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
266 // Without SSE, i64->f64 goes through memory.
267 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
268 }
269 } else if (!Subtarget.is64Bit())
270 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
271
272 // Scalar integer divide and remainder are lowered to use operations that
273 // produce two results, to match the available instructions. This exposes
274 // the two-result form to trivial CSE, which is able to combine x/y and x%y
275 // into a single instruction.
276 //
277 // Scalar integer multiply-high is also lowered to use two-result
278 // operations, to match the available instructions. However, plain multiply
279 // (low) operations are left as Legal, as there are single-result
280 // instructions for this in x86. Using the two-result multiply instructions
281 // when both high and low results are needed must be arranged by dagcombine.
282 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
283 setOperationAction(ISD::MULHS, VT, Expand);
284 setOperationAction(ISD::MULHU, VT, Expand);
285 setOperationAction(ISD::SDIV, VT, Expand);
286 setOperationAction(ISD::UDIV, VT, Expand);
287 setOperationAction(ISD::SREM, VT, Expand);
288 setOperationAction(ISD::UREM, VT, Expand);
289
    // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
291 setOperationAction(ISD::ADDC, VT, Custom);
292 setOperationAction(ISD::ADDE, VT, Custom);
293 setOperationAction(ISD::SUBC, VT, Custom);
294 setOperationAction(ISD::SUBE, VT, Custom);
295 }
296
297 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
298 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
299 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
300 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
301 setOperationAction(ISD::BR_CC, VT, Expand);
302 setOperationAction(ISD::SELECT_CC, VT, Expand);
303 }
304 if (Subtarget.is64Bit())
305 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
306 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
307 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
308 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
309 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
310
311 setOperationAction(ISD::FREM , MVT::f32 , Expand);
312 setOperationAction(ISD::FREM , MVT::f64 , Expand);
313 setOperationAction(ISD::FREM , MVT::f80 , Expand);
314 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
315
  // Promote the i8 variants up to i32, which has a shorter encoding.
318 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
319 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
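  // Without BMI there is no TZCNT: BSF leaves its destination undefined for
  // a zero input, so plain CTTZ needs custom handling, while CTTZ_ZERO_UNDEF
  // can map directly to BSF.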
320 if (!Subtarget.hasBMI()) {
321 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
322 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
323 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
324 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
325 if (Subtarget.is64Bit()) {
326 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
327 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
328 }
329 }
330
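  // LZCNT gives a well-defined result for a zero input; without it, CTLZ has
  // to be custom lowered around BSR's undefined-on-zero behavior.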
331 if (Subtarget.hasLZCNT()) {
332 // When promoting the i8 variants, force them to i32 for a shorter
333 // encoding.
334 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
335 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
336 } else {
337 setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
338 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
339 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
340 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
341 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
342 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
343 if (Subtarget.is64Bit()) {
344 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
345 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
346 }
347 }
348
349 // Special handling for half-precision floating point conversions.
350 // If we don't have F16C support, then lower half float conversions
351 // into library calls.
352 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
353 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
354 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
355 }
356
357 // There's never any support for operations beyond MVT::f32.
358 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
359 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
360 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
361 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
362
363 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
364 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
365 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
366 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
367 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
368 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
369
370 if (Subtarget.hasPOPCNT()) {
371 setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
372 } else {
373 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
374 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
375 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
376 if (Subtarget.is64Bit())
377 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
378 }
379
380 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
381
382 if (!Subtarget.hasMOVBE())
383 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
384
385 // These should be promoted to a larger select which is supported.
386 setOperationAction(ISD::SELECT , MVT::i1 , Promote);
387 // X86 wants to expand cmov itself.
388 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
389 setOperationAction(ISD::SELECT, VT, Custom);
390 setOperationAction(ISD::SETCC, VT, Custom);
391 }
392 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
393 if (VT == MVT::i64 && !Subtarget.is64Bit())
394 continue;
395 setOperationAction(ISD::SELECT, VT, Custom);
396 setOperationAction(ISD::SETCC, VT, Custom);
397 setOperationAction(ISD::SETCCE, VT, Custom);
398 }
399 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are NOT intended to support SjLj exception
  // handling; they are a light-weight setjmp/longjmp replacement used to
  // support continuations, user-level threading, etc. As a result, no other
  // SjLj exception interfaces are implemented, so please don't build your
  // own exception handling on top of them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
406 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
407 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
408 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
409 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
410 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
411
412 // Darwin ABI issue.
413 for (auto VT : { MVT::i32, MVT::i64 }) {
414 if (VT == MVT::i64 && !Subtarget.is64Bit())
415 continue;
416 setOperationAction(ISD::ConstantPool , VT, Custom);
417 setOperationAction(ISD::JumpTable , VT, Custom);
418 setOperationAction(ISD::GlobalAddress , VT, Custom);
419 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
420 setOperationAction(ISD::ExternalSymbol , VT, Custom);
421 setOperationAction(ISD::BlockAddress , VT, Custom);
422 }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
424 for (auto VT : { MVT::i32, MVT::i64 }) {
425 if (VT == MVT::i64 && !Subtarget.is64Bit())
426 continue;
427 setOperationAction(ISD::SHL_PARTS, VT, Custom);
428 setOperationAction(ISD::SRA_PARTS, VT, Custom);
429 setOperationAction(ISD::SRL_PARTS, VT, Custom);
430 }
431
432 if (Subtarget.hasSSE1())
433 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
434
435 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
436
  // Custom-lower certain atomic operations.
438 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
439 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
440 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
441 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
442 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
443 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
444 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
445 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
446 }
447
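  // CMPXCHG16B makes a 128-bit compare-and-swap available.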
448 if (Subtarget.hasCmpxchg16b()) {
449 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
450 }
451
452 // FIXME - use subtarget debug flags
453 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
454 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
455 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
456 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
457 }
458
459 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
460 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
461
462 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
463 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
464
465 setOperationAction(ISD::TRAP, MVT::Other, Legal);
466 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
467
468 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
469 setOperationAction(ISD::VASTART , MVT::Other, Custom);
470 setOperationAction(ISD::VAEND , MVT::Other, Expand);
471 bool Is64Bit = Subtarget.is64Bit();
472 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
473 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
474
475 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
476 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
477
478 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
479
480 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
481 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
482 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
483
484 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
485 // f32 and f64 use SSE.
486 // Set up the FP register classes.
487 addRegisterClass(MVT::f32, &X86::FR32RegClass);
488 addRegisterClass(MVT::f64, &X86::FR64RegClass);
489
490 for (auto VT : { MVT::f32, MVT::f64 }) {
491 // Use ANDPD to simulate FABS.
492 setOperationAction(ISD::FABS, VT, Custom);
493
494 // Use XORP to simulate FNEG.
495 setOperationAction(ISD::FNEG, VT, Custom);
496
497 // Use ANDPD and ORPD to simulate FCOPYSIGN.
498 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
499
500 // We don't support sin/cos/fmod
501 setOperationAction(ISD::FSIN , VT, Expand);
502 setOperationAction(ISD::FCOS , VT, Expand);
503 setOperationAction(ISD::FSINCOS, VT, Expand);
504 }
505
506 // Lower this to MOVMSK plus an AND.
507 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
508 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
509
510 // Expand FP immediates into loads from the stack, except for the special
511 // cases we handle.
512 addLegalFPImmediate(APFloat(+0.0)); // xorpd
513 addLegalFPImmediate(APFloat(+0.0f)); // xorps
514 } else if (UseX87 && X86ScalarSSEf32) {
515 // Use SSE for f32, x87 for f64.
516 // Set up the FP register classes.
517 addRegisterClass(MVT::f32, &X86::FR32RegClass);
518 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
519
520 // Use ANDPS to simulate FABS.
521 setOperationAction(ISD::FABS , MVT::f32, Custom);
522
523 // Use XORP to simulate FNEG.
524 setOperationAction(ISD::FNEG , MVT::f32, Custom);
525
526 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
527
528 // Use ANDPS and ORPS to simulate FCOPYSIGN.
529 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
530 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
531
532 // We don't support sin/cos/fmod
533 setOperationAction(ISD::FSIN , MVT::f32, Expand);
534 setOperationAction(ISD::FCOS , MVT::f32, Expand);
535 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
536
537 // Special cases we handle for FP constants.
538 addLegalFPImmediate(APFloat(+0.0f)); // xorps
539 addLegalFPImmediate(APFloat(+0.0)); // FLD0
540 addLegalFPImmediate(APFloat(+1.0)); // FLD1
541 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
542 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
543
544 if (!TM.Options.UnsafeFPMath) {
545 setOperationAction(ISD::FSIN , MVT::f64, Expand);
546 setOperationAction(ISD::FCOS , MVT::f64, Expand);
547 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
548 }
549 } else if (UseX87) {
550 // f32 and f64 in x87.
551 // Set up the FP register classes.
552 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
553 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
554
555 for (auto VT : { MVT::f32, MVT::f64 }) {
556 setOperationAction(ISD::UNDEF, VT, Expand);
557 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
558
559 if (!TM.Options.UnsafeFPMath) {
560 setOperationAction(ISD::FSIN , VT, Expand);
561 setOperationAction(ISD::FCOS , VT, Expand);
562 setOperationAction(ISD::FSINCOS, VT, Expand);
563 }
564 }
565 addLegalFPImmediate(APFloat(+0.0)); // FLD0
566 addLegalFPImmediate(APFloat(+1.0)); // FLD1
567 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
568 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
569 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
570 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
571 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
572 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
573 }
574
575 // We don't support FMA.
576 setOperationAction(ISD::FMA, MVT::f64, Expand);
577 setOperationAction(ISD::FMA, MVT::f32, Expand);
578
579 // Long double always uses X87, except f128 in MMX.
580 if (UseX87) {
581 if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
582 addRegisterClass(MVT::f128, &X86::FR128RegClass);
583 ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
584 setOperationAction(ISD::FABS , MVT::f128, Custom);
585 setOperationAction(ISD::FNEG , MVT::f128, Custom);
586 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
587 }
588
589 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
590 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
591 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
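    // Register the f80 immediates we can materialize directly: +/-0.0 via
    // FLD0 (and FCHS) and +/-1.0 via FLD1 (and FCHS).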
592 {
593 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
594 addLegalFPImmediate(TmpFlt); // FLD0
595 TmpFlt.changeSign();
596 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
597
598 bool ignored;
599 APFloat TmpFlt2(+1.0);
600 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
601 &ignored);
602 addLegalFPImmediate(TmpFlt2); // FLD1
603 TmpFlt2.changeSign();
604 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
605 }
606
607 if (!TM.Options.UnsafeFPMath) {
608 setOperationAction(ISD::FSIN , MVT::f80, Expand);
609 setOperationAction(ISD::FCOS , MVT::f80, Expand);
610 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
611 }
612
613 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
614 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
615 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
616 setOperationAction(ISD::FRINT, MVT::f80, Expand);
617 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
618 setOperationAction(ISD::FMA, MVT::f80, Expand);
619 }
620
621 // Always use a library call for pow.
622 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
623 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
624 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
625
626 setOperationAction(ISD::FLOG, MVT::f80, Expand);
627 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
628 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
629 setOperationAction(ISD::FEXP, MVT::f80, Expand);
630 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
631 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
632 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
633
634 // Some FP actions are always expanded for vector types.
635 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
636 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
637 setOperationAction(ISD::FSIN, VT, Expand);
638 setOperationAction(ISD::FSINCOS, VT, Expand);
639 setOperationAction(ISD::FCOS, VT, Expand);
640 setOperationAction(ISD::FREM, VT, Expand);
641 setOperationAction(ISD::FPOWI, VT, Expand);
642 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
643 setOperationAction(ISD::FPOW, VT, Expand);
644 setOperationAction(ISD::FLOG, VT, Expand);
645 setOperationAction(ISD::FLOG2, VT, Expand);
646 setOperationAction(ISD::FLOG10, VT, Expand);
647 setOperationAction(ISD::FEXP, VT, Expand);
648 setOperationAction(ISD::FEXP2, VT, Expand);
649 }
650
651 // First set operation action for all vector types to either promote
652 // (for widening) or expand (for scalarization). Then we will selectively
653 // turn on ones that can be effectively codegen'd.
654 for (MVT VT : MVT::vector_valuetypes()) {
655 setOperationAction(ISD::SDIV, VT, Expand);
656 setOperationAction(ISD::UDIV, VT, Expand);
657 setOperationAction(ISD::SREM, VT, Expand);
658 setOperationAction(ISD::UREM, VT, Expand);
659 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
660 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
661 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
662 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
663 setOperationAction(ISD::FMA, VT, Expand);
664 setOperationAction(ISD::FFLOOR, VT, Expand);
665 setOperationAction(ISD::FCEIL, VT, Expand);
666 setOperationAction(ISD::FTRUNC, VT, Expand);
667 setOperationAction(ISD::FRINT, VT, Expand);
668 setOperationAction(ISD::FNEARBYINT, VT, Expand);
669 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
670 setOperationAction(ISD::MULHS, VT, Expand);
671 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
672 setOperationAction(ISD::MULHU, VT, Expand);
673 setOperationAction(ISD::SDIVREM, VT, Expand);
674 setOperationAction(ISD::UDIVREM, VT, Expand);
675 setOperationAction(ISD::CTPOP, VT, Expand);
676 setOperationAction(ISD::CTTZ, VT, Expand);
677 setOperationAction(ISD::CTLZ, VT, Expand);
678 setOperationAction(ISD::ROTL, VT, Expand);
679 setOperationAction(ISD::ROTR, VT, Expand);
680 setOperationAction(ISD::BSWAP, VT, Expand);
681 setOperationAction(ISD::SETCC, VT, Expand);
682 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
683 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
684 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
685 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
686 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
687 setOperationAction(ISD::TRUNCATE, VT, Expand);
688 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
689 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
690 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
691 setOperationAction(ISD::SELECT_CC, VT, Expand);
692 for (MVT InnerVT : MVT::vector_valuetypes()) {
693 setTruncStoreAction(InnerVT, VT, Expand);
694
695 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
696 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
697
      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types; we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
702 if (VT.getVectorElementType() == MVT::i1)
703 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
704
705 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
706 // split/scalarized right now.
707 if (VT.getVectorElementType() == MVT::f16)
708 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
709 }
710 }
711
712 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
713 // with -msoft-float, disable use of MMX as well.
714 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
715 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx are supported; everything uses intrinsics.
717 }
718
719 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
720 addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
721
722 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
723 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
724 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
725 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
726 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
727 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
728 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
729 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
730 }
731
732 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
733 addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
734
735 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
736 // registers cannot be used even for integer operations.
737 addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
738 addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
739 addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
740 addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
741
742 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
743 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
744 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
745 setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
746 setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
747 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
748 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
749 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
750 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
751 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
752 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
753 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
754
755 setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
756 setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
757 setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
758 setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
759
760 setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
761 setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
762 setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
763 setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
764
765 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
766 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
767 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
768 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
769 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
770
771 setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
772 setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
773 setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
774 setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
775
776 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
777 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
778 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
779 // ISD::CTTZ v2i64 - scalarization is faster.
780
781 // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
782 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
783 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
784 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
785 setOperationAction(ISD::VSELECT, VT, Custom);
786 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
787 }
788
789 // We support custom legalizing of sext and anyext loads for specific
790 // memory vector types which we can load as a scalar (or sequence of
791 // scalars) and extend in-register to a legal 128-bit vector type. For sext
792 // loads these must work with a single scalar load.
793 for (MVT VT : MVT::integer_vector_valuetypes()) {
794 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
795 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
796 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
797 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
798 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
799 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
800 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
801 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
802 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
803 }
804
805 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
806 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
807 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
808 setOperationAction(ISD::VSELECT, VT, Custom);
809
810 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
811 continue;
812
813 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
814 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
815 }
816
817 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
818 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
819 setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
820 setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
821 setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
822 setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
823 setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
824 }
825
826 // Custom lower v2i64 and v2f64 selects.
827 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
828 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
829
830 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
831 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
832
833 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
834
835 setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
836 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    // As there is no 64-bit GPR available, we need to build a custom sequence
    // to convert from v2i32 to v2f32.
839 if (!Subtarget.is64Bit())
840 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
841
842 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
843 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
844
845 for (MVT VT : MVT::fp_vector_valuetypes())
846 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
847
848 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
849 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
850 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
851
852 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
853 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
854 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
855
856 for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
857 setOperationAction(ISD::SRL, VT, Custom);
858 setOperationAction(ISD::SHL, VT, Custom);
859 setOperationAction(ISD::SRA, VT, Custom);
860 }
861
862 // In the customized shift lowering, the legal cases in AVX2 will be
863 // recognized.
864 for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
865 setOperationAction(ISD::SRL, VT, Custom);
866 setOperationAction(ISD::SHL, VT, Custom);
867 setOperationAction(ISD::SRA, VT, Custom);
868 }
869 }
870
871 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
872 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
873 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
874 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
875 // ISD::CTLZ v4i32 - scalarization is faster.
876 // ISD::CTLZ v2i64 - scalarization is faster.
877 }
878
879 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
880 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
881 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
882 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
883 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
884 setOperationAction(ISD::FRINT, RoundedTy, Legal);
885 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
886 }
887
888 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
889 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
890 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
891 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
892 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
893 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
894 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
895 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
896
897 // FIXME: Do we need to handle scalar-to-vector here?
898 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
899
900 // We directly match byte blends in the backend as they match the VSELECT
901 // condition form.
902 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
903
904 // SSE41 brings specific instructions for doing vector sign extend even in
905 // cases where we don't have SRA.
906 for (MVT VT : MVT::integer_vector_valuetypes()) {
907 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
908 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
909 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
910 }
911
912 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
913 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
914 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
915 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
916 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
917 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
918 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
919
920 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
921 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
922 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
923 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
924 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
925 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
926
    // i8 vectors are custom because the source register and source memory
    // operand types are not the same width.
929 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
930 }
931
932 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
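    // XOP has per-element variable rotate instructions (VPROT*); vector ROTL
    // is custom lowered to take advantage of them.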
933 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
934 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
935 setOperationAction(ISD::ROTL, VT, Custom);
936
937 // XOP can efficiently perform BITREVERSE with VPPERM.
938 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
939 setOperationAction(ISD::BITREVERSE, VT, Custom);
940
941 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
942 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
943 setOperationAction(ISD::BITREVERSE, VT, Custom);
944 }
945
946 if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
947 bool HasInt256 = Subtarget.hasInt256();
948
949 addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
950 addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
951 addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
952 addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
953 addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
954 addRegisterClass(MVT::v4f64, &X86::VR256RegClass);
955
956 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
957 setOperationAction(ISD::FFLOOR, VT, Legal);
958 setOperationAction(ISD::FCEIL, VT, Legal);
959 setOperationAction(ISD::FTRUNC, VT, Legal);
960 setOperationAction(ISD::FRINT, VT, Legal);
961 setOperationAction(ISD::FNEARBYINT, VT, Legal);
962 setOperationAction(ISD::FNEG, VT, Custom);
963 setOperationAction(ISD::FABS, VT, Custom);
964 }
965
966 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
967 // even though v8i16 is a legal type.
968 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
969 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
970 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
971
972 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
973 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
974 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
975
976 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
977 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
978
979 for (MVT VT : MVT::fp_vector_valuetypes())
980 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
981
982 for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
983 setOperationAction(ISD::SRL, VT, Custom);
984 setOperationAction(ISD::SHL, VT, Custom);
985 setOperationAction(ISD::SRA, VT, Custom);
986 }
987
988 setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
989 setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
990 setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
991 setOperationAction(ISD::SETCC, MVT::v4i64, Custom);
992
993 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
994 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
995 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
996
997 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
998 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
999 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1000 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
1001 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
1002 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
1003 setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
1004 setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
1005 setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
1006 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1007 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1008 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1009 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1010
1011 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1012 setOperationAction(ISD::CTPOP, VT, Custom);
1013 setOperationAction(ISD::CTTZ, VT, Custom);
1014 }
1015
1016 // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2
1017 // as we end up splitting the 256-bit vectors.
1018 for (auto VT : { MVT::v32i8, MVT::v16i16 })
1019 setOperationAction(ISD::CTLZ, VT, Custom);
1020
1021 if (HasInt256)
1022 for (auto VT : { MVT::v8i32, MVT::v4i64 })
1023 setOperationAction(ISD::CTLZ, VT, Custom);
1024
1025 if (Subtarget.hasAnyFMA()) {
1026 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1027 MVT::v2f64, MVT::v4f64 })
1028 setOperationAction(ISD::FMA, VT, Legal);
1029 }
1030
1031 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1032 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1033 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1034 }
1035
1036 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1037 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1038 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1039 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1040
1041 setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
1042 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
1043
1044 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1045 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1046 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1047 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1048
1049 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1050 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1051 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1052 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1053 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1054 }
1055
1056 if (HasInt256) {
1057 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
1058 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
1059 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1060
      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
      // when we have a 256-bit-wide blend with immediate.
1063 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1064
1065 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1066 setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1067 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1068 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1069 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1070 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1071 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1072
1073 setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1074 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1075 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1076 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1077 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1078 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1079 }
1080
1081 // In the customized shift lowering, the legal cases in AVX2 will be
1082 // recognized.
1083 for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
1084 setOperationAction(ISD::SRL, VT, Custom);
1085 setOperationAction(ISD::SHL, VT, Custom);
1086 setOperationAction(ISD::SRA, VT, Custom);
1087 }
1088
1089 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1090 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1091 setOperationAction(ISD::MLOAD, VT, Legal);
1092 setOperationAction(ISD::MSTORE, VT, Legal);
1093 }
1094
1095 // Extract subvector is special because the value type
1096 // (result) is 128-bit but the source is 256-bit wide.
1097 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1098 MVT::v4f32, MVT::v2f64 }) {
1099 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1100 }
1101
1102 // Custom lower several nodes for 256-bit types.
1103 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1104 MVT::v8f32, MVT::v4f64 }) {
1105 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1106 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1107 setOperationAction(ISD::VSELECT, VT, Custom);
1108 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1109 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1110 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1111 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1112 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1113 }
1114
1115 if (HasInt256)
1116 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1117
1118 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1119 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1120 setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
1121 setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
1122 setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
1123 setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
1124 setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
1125 }
1126 }
1127
1128 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1129 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1130 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1131 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1132 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1133
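    // AVX-512 mask registers (k0-k7) hold i1 and vXi1 values.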
1134 addRegisterClass(MVT::i1, &X86::VK1RegClass);
1135 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1136 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1137
1138 for (MVT VT : MVT::fp_vector_valuetypes())
1139 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1140
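    // AVX-512 also has 512-bit sign/zero extending vector loads, VPMOV[SZ]X.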
1141 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1142 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1143 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1144 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1145 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1146 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1147 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1148 }
1149 setOperationAction(ISD::BR_CC, MVT::i1, Expand);
1150 setOperationAction(ISD::SETCC, MVT::i1, Custom);
1151 setOperationAction(ISD::SETCCE, MVT::i1, Custom);
1152 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
1153 setOperationAction(ISD::XOR, MVT::i1, Legal);
1154 setOperationAction(ISD::OR, MVT::i1, Legal);
1155 setOperationAction(ISD::AND, MVT::i1, Legal);
1156 setOperationAction(ISD::SUB, MVT::i1, Custom);
1157 setOperationAction(ISD::ADD, MVT::i1, Custom);
1158 setOperationAction(ISD::MUL, MVT::i1, Custom);
1159
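    // For each vector type below, extending loads from, and truncating
    // stores to, the matching vXi1 mask type are custom lowered.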
1160 for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
1161 MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
1162 MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
1163 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1164 setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1165 setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1166 setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
1167 setTruncStoreAction(VT, MaskVT, Custom);
1168 }
1169
1170 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1171 setOperationAction(ISD::FNEG, VT, Custom);
1172 setOperationAction(ISD::FABS, VT, Custom);
1173 setOperationAction(ISD::FMA, VT, Legal);
1174 }
1175
1176 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1177 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1178 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1179 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1180 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1181 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1182 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1183 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
1184 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
1185 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1186 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1187 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1188 setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
1189 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
1190 setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
1191 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1192
1193 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1194 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1195 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1196 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1197 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
    if (Subtarget.hasVLX()) {
1199 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1200 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1201 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1202 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1203 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1204
1205 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1206 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1207 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1208 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1209 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1210 } else {
1211 setOperationAction(ISD::MLOAD, MVT::v8i32, Custom);
1212 setOperationAction(ISD::MLOAD, MVT::v8f32, Custom);
1213 setOperationAction(ISD::MSTORE, MVT::v8i32, Custom);
1214 setOperationAction(ISD::MSTORE, MVT::v8f32, Custom);
1215 }
1216 setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
1217 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1218 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1219 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
1220 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
1221 setOperationAction(ISD::VSELECT, MVT::v8i1, Expand);
1222 setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
1223 if (Subtarget.hasDQI()) {
1224 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1225 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1226 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1227 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1228 if (Subtarget.hasVLX()) {
1229 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
1230 setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
1231 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
1232 setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
1233 setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
1234 setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
1235 setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
1236 setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
1237 }
1238 }
1239 if (Subtarget.hasVLX()) {
1240 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1241 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1242 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1243 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1244 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1245 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1246 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1247 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1248 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
1249 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
1250
      // FIXME: These instructions are also available with SSE/AVX2; add the
      // relevant patterns.
1252 setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1253 setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1254 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
1255 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1256 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1257 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1258 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1259 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
1260 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1261 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1262 }
1263
1264 setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
1265 setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
1266 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1267 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1268 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1269 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1270 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1271 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1272 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1273 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
1274 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
1275 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1276 if (Subtarget.hasDQI()) {
1277 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
1278 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
1279 }
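    // FP rounding operations on 512-bit vectors map onto VRNDSCALEPS/VRNDSCALEPD.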
1280 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1281 setOperationAction(ISD::FFLOOR, VT, Legal);
1282 setOperationAction(ISD::FCEIL, VT, Legal);
1283 setOperationAction(ISD::FTRUNC, VT, Legal);
1284 setOperationAction(ISD::FRINT, VT, Legal);
1285 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1286 }
1287
1288 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1289 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1290 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1291 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1292 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
1293
1294 setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
1295 setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
1296
1297 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1298
1299 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
1300 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1301 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
1302 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
1303 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
1304 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
1305 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
1306 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1307 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1308 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1309 setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
1310 setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
1311
1312 setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
1313 setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
1314 setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
1315 setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
1316 setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
1317 setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
1318 setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
1319 setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
1320
1321 setOperationAction(ISD::ADD, MVT::v8i1, Expand);
1322 setOperationAction(ISD::ADD, MVT::v16i1, Expand);
1323 setOperationAction(ISD::SUB, MVT::v8i1, Expand);
1324 setOperationAction(ISD::SUB, MVT::v16i1, Expand);
1325 setOperationAction(ISD::MUL, MVT::v8i1, Expand);
1326 setOperationAction(ISD::MUL, MVT::v16i1, Expand);
1327
1328 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1329
1330 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1331 setOperationAction(ISD::SRL, VT, Custom);
1332 setOperationAction(ISD::SHL, VT, Custom);
1333 setOperationAction(ISD::SRA, VT, Custom);
1334 setOperationAction(ISD::AND, VT, Legal);
1335 setOperationAction(ISD::OR, VT, Legal);
1336 setOperationAction(ISD::XOR, VT, Legal);
1337 setOperationAction(ISD::CTPOP, VT, Custom);
1338 setOperationAction(ISD::CTTZ, VT, Custom);
1339 }
1340
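    // AVX-512CD provides vector leading-zero count (VPLZCNTD/VPLZCNTQ).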
1341 if (Subtarget.hasCDI()) {
1342 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
1343 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
1344
1345 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1346 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1347 setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
1348 setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
1349
1350 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
1351 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
1352
1353 if (Subtarget.hasVLX()) {
1354 setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
1355 setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
1356 setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
1357 setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
1358 } else {
1359 setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
1360 setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
1361 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1362 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1363 }
1364
1365 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
1366 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
1367 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
1368 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
1369 } // Subtarget.hasCDI()
1370
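    // AVX-512DQ provides a native 64-bit element multiply (VPMULLQ).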
1371 if (Subtarget.hasDQI()) {
1372 if (Subtarget.hasVLX()) {
1373 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1374 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
1375 }
1376 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1377 }
1378 // Custom lower several nodes.
1379 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1380 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1381 setOperationAction(ISD::MGATHER, VT, Custom);
1382 setOperationAction(ISD::MSCATTER, VT, Custom);
1383 }
1384 // Extract subvector is special because the value type
1385 // (result) is 256-bit but the source is 512-bit wide.
1386 // 128-bit was made Custom under AVX1.
1387 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1388 MVT::v8f32, MVT::v4f64 })
1389 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1390 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1391 MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1392 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1393
1394 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1395 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1396 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1397 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1398 setOperationAction(ISD::VSELECT, VT, Legal);
1399 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1400 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1401 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1402 setOperationAction(ISD::MLOAD, VT, Legal);
1403 setOperationAction(ISD::MSTORE, VT, Legal);
1404 setOperationAction(ISD::MGATHER, VT, Legal);
1405 setOperationAction(ISD::MSCATTER, VT, Custom);
1406 }
1407 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1408 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1409 }
1410 }// has AVX-512
1411
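  // AVX-512BW adds 512-bit byte/word vector operations together with the
  // 32- and 64-bit mask register types.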
1412 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1413 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1414 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1415
1416 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1417 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1418
1419 setOperationAction(ISD::ADD, MVT::v32i1, Expand);
1420 setOperationAction(ISD::ADD, MVT::v64i1, Expand);
1421 setOperationAction(ISD::SUB, MVT::v32i1, Expand);
1422 setOperationAction(ISD::SUB, MVT::v64i1, Expand);
1423 setOperationAction(ISD::MUL, MVT::v32i1, Expand);
1424 setOperationAction(ISD::MUL, MVT::v64i1, Expand);
1425
1426 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1427 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1428 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1429 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1430 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1431 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1432 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1433 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1434 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1435 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1436 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1437 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1438 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom);
1439 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
1440 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1441 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1442 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1443 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1444 setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
1445 setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
1446 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1447 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1448 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1449 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1450 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1451 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1452 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1453 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1454 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1455 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
1456 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
1457 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1458 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1459 setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
1460 setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
1461 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
1462 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
1463 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1464 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
1465 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
1466 setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
1467 setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
1468 setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
1469 setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
1470 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1471
1472 setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
1473 setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
1474 setOperationAction(ISD::UMAX, MVT::v64i8, Legal);
1475 setOperationAction(ISD::UMAX, MVT::v32i16, Legal);
1476 setOperationAction(ISD::SMIN, MVT::v64i8, Legal);
1477 setOperationAction(ISD::SMIN, MVT::v32i16, Legal);
1478 setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
1479 setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
1480
1481 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1482 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1483 if (Subtarget.hasVLX())
1484 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1485
1486 LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1487 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1488 setOperationAction(ISD::MLOAD, VT, Action);
1489 setOperationAction(ISD::MSTORE, VT, Action);
1490 }
1491
1492 if (Subtarget.hasCDI()) {
1493 setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
1494 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
1495 }
1496
1497 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1498 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1499 setOperationAction(ISD::VSELECT, VT, Legal);
1500 setOperationAction(ISD::SRL, VT, Custom);
1501 setOperationAction(ISD::SHL, VT, Custom);
1502 setOperationAction(ISD::SRA, VT, Custom);
1503 setOperationAction(ISD::MLOAD, VT, Legal);
1504 setOperationAction(ISD::MSTORE, VT, Legal);
1505 setOperationAction(ISD::CTPOP, VT, Custom);
1506 setOperationAction(ISD::CTTZ, VT, Custom);
1507
1508 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1509 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1510 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1511 }
1512
1513 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1514 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1515 if (Subtarget.hasVLX()) {
1516         // FIXME: These operations are also available on SSE/AVX2; add the relevant patterns.
1517 setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1518 setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
1519 }
1520 }
1521 }
1522
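  // AVX-512VL makes the AVX-512 instructions available on 128- and 256-bit
  // vectors and adds the narrow v2i1/v4i1 mask types.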
1523 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1524 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1525 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1526
1527 setOperationAction(ISD::ADD, MVT::v2i1, Expand);
1528 setOperationAction(ISD::ADD, MVT::v4i1, Expand);
1529 setOperationAction(ISD::SUB, MVT::v2i1, Expand);
1530 setOperationAction(ISD::SUB, MVT::v4i1, Expand);
1531 setOperationAction(ISD::MUL, MVT::v2i1, Expand);
1532 setOperationAction(ISD::MUL, MVT::v4i1, Expand);
1533
1534 setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom);
1535 setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom);
1536 setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
1537 setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
1538 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1539 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1540 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1541 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1542 setOperationAction(ISD::SELECT, MVT::v4i1, Custom);
1543 setOperationAction(ISD::SELECT, MVT::v2i1, Custom);
1544 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);
1545 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom);
1546 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom);
1547 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom);
1548 setOperationAction(ISD::VSELECT, MVT::v2i1, Expand);
1549 setOperationAction(ISD::VSELECT, MVT::v4i1, Expand);
1550
1551 for (auto VT : { MVT::v4i32, MVT::v8i32 }) {
1552 setOperationAction(ISD::AND, VT, Legal);
1553 setOperationAction(ISD::OR, VT, Legal);
1554 setOperationAction(ISD::XOR, VT, Legal);
1555 }
1556
1557 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1558 setOperationAction(ISD::SMAX, VT, Legal);
1559 setOperationAction(ISD::UMAX, VT, Legal);
1560 setOperationAction(ISD::SMIN, VT, Legal);
1561 setOperationAction(ISD::UMIN, VT, Legal);
1562 }
1563 }
1564
1565 // We want to custom lower some of our intrinsics.
1566 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1567 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1568 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1569 if (!Subtarget.is64Bit()) {
1570 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1571 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1572 }
1573
1574 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1575 // handle type legalization for these operations here.
1576 //
1577 // FIXME: We really should do custom legalization for addition and
1578 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1579 // than generic legalization for 64-bit multiplication-with-overflow, though.
1580 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1581 if (VT == MVT::i64 && !Subtarget.is64Bit())
1582 continue;
1583 // Add/Sub/Mul with overflow operations are custom lowered.
1584 setOperationAction(ISD::SADDO, VT, Custom);
1585 setOperationAction(ISD::UADDO, VT, Custom);
1586 setOperationAction(ISD::SSUBO, VT, Custom);
1587 setOperationAction(ISD::USUBO, VT, Custom);
1588 setOperationAction(ISD::SMULO, VT, Custom);
1589 setOperationAction(ISD::UMULO, VT, Custom);
1590 }
1591
1592 if (!Subtarget.is64Bit()) {
1593     // These libcalls are not available in 32-bit mode.
1594 setLibcallName(RTLIB::SHL_I128, nullptr);
1595 setLibcallName(RTLIB::SRL_I128, nullptr);
1596 setLibcallName(RTLIB::SRA_I128, nullptr);
1597 }
1598
1599 // Combine sin / cos into one node or libcall if possible.
1600 if (Subtarget.hasSinCos()) {
1601 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1602 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1603 if (Subtarget.isTargetDarwin()) {
1604 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1605 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1606 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1607 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1608 }
1609 }
1610
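  // The Win64 ABI passes 128-bit integers by pointer, so the generic i128
  // division/remainder libcalls don't fit; custom lowering emits libcalls
  // with the arguments passed indirectly.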
1611 if (Subtarget.isTargetWin64()) {
1612 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1613 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1614 setOperationAction(ISD::SREM, MVT::i128, Custom);
1615 setOperationAction(ISD::UREM, MVT::i128, Custom);
1616 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1617 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1618 }
1619
1620   // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1621 // is. We should promote the value to 64-bits to solve this.
1622 // This is what the CRT headers do - `fmodf` is an inline header
1623 // function casting to f64 and calling `fmod`.
1624 if (Subtarget.is32Bit() && Subtarget.isTargetKnownWindowsMSVC())
1625 for (ISD::NodeType Op :
1626 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1627 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1628 if (isOperationExpand(Op, MVT::f32))
1629 setOperationAction(Op, MVT::f32, Promote);
1630
1631 // We have target-specific dag combine patterns for the following nodes:
1632 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1633 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1634 setTargetDAGCombine(ISD::BITCAST);
1635 setTargetDAGCombine(ISD::VSELECT);
1636 setTargetDAGCombine(ISD::SELECT);
1637 setTargetDAGCombine(ISD::SHL);
1638 setTargetDAGCombine(ISD::SRA);
1639 setTargetDAGCombine(ISD::SRL);
1640 setTargetDAGCombine(ISD::OR);
1641 setTargetDAGCombine(ISD::AND);
1642 setTargetDAGCombine(ISD::ADD);
1643 setTargetDAGCombine(ISD::FADD);
1644 setTargetDAGCombine(ISD::FSUB);
1645 setTargetDAGCombine(ISD::FNEG);
1646 setTargetDAGCombine(ISD::FMA);
1647 setTargetDAGCombine(ISD::FMINNUM);
1648 setTargetDAGCombine(ISD::FMAXNUM);
1649 setTargetDAGCombine(ISD::SUB);
1650 setTargetDAGCombine(ISD::LOAD);
1651 setTargetDAGCombine(ISD::MLOAD);
1652 setTargetDAGCombine(ISD::STORE);
1653 setTargetDAGCombine(ISD::MSTORE);
1654 setTargetDAGCombine(ISD::TRUNCATE);
1655 setTargetDAGCombine(ISD::ZERO_EXTEND);
1656 setTargetDAGCombine(ISD::ANY_EXTEND);
1657 setTargetDAGCombine(ISD::SIGN_EXTEND);
1658 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1659 setTargetDAGCombine(ISD::SINT_TO_FP);
1660 setTargetDAGCombine(ISD::UINT_TO_FP);
1661 setTargetDAGCombine(ISD::SETCC);
1662 setTargetDAGCombine(ISD::MUL);
1663 setTargetDAGCombine(ISD::XOR);
1664 setTargetDAGCombine(ISD::MSCATTER);
1665 setTargetDAGCombine(ISD::MGATHER);
1666
1667 computeRegisterProperties(Subtarget.getRegisterInfo());
1668
1669 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1670 MaxStoresPerMemsetOptSize = 8;
1671 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1672 MaxStoresPerMemcpyOptSize = 4;
1673 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1674 MaxStoresPerMemmoveOptSize = 4;
1675 setPrefLoopAlignment(4); // 2^4 bytes.
1676
1677 // An out-of-order CPU can speculatively execute past a predictable branch,
1678 // but a conditional move could be stalled by an expensive earlier operation.
1679 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1680 EnableExtLdPromotion = true;
1681 setPrefFunctionAlignment(4); // 2^4 bytes.
1682
1683 verifyIntrinsicTables();
1684 }
1685
1686 // This has so far only been implemented for 64-bit MachO.
1687 bool X86TargetLowering::useLoadStackGuardNode() const {
1688 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1689 }
1690
1691 TargetLoweringBase::LegalizeTypeAction
1692 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1693 if (ExperimentalVectorWideningLegalization &&
1694 VT.getVectorNumElements() != 1 &&
1695 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1696 return TypeWidenVector;
1697
1698 return TargetLoweringBase::getPreferredVectorAction(VT);
1699 }
1700
1701 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1702 LLVMContext& Context,
1703 EVT VT) const {
1704 if (!VT.isVector())
1705 return Subtarget.hasAVX512() ? MVT::i1: MVT::i8;
1706
1707 if (VT.isSimple()) {
1708 MVT VVT = VT.getSimpleVT();
1709 const unsigned NumElts = VVT.getVectorNumElements();
1710 MVT EltVT = VVT.getVectorElementType();
1711 if (VVT.is512BitVector()) {
1712 if (Subtarget.hasAVX512())
1713 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1714 EltVT == MVT::f32 || EltVT == MVT::f64)
1715 switch(NumElts) {
1716 case 8: return MVT::v8i1;
1717 case 16: return MVT::v16i1;
1718 }
1719 if (Subtarget.hasBWI())
1720 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1721 switch(NumElts) {
1722 case 32: return MVT::v32i1;
1723 case 64: return MVT::v64i1;
1724 }
1725 }
1726
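    // With both BWI and VLX every legal element count has a matching k-mask
    // type, so use the vXi1 type directly.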
1727 if (Subtarget.hasBWI() && Subtarget.hasVLX())
1728 return MVT::getVectorVT(MVT::i1, NumElts);
1729
1730 if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1731 EVT LegalVT = getTypeToTransformTo(Context, VT);
1732 EltVT = LegalVT.getVectorElementType().getSimpleVT();
1733 }
1734
1735 if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1736 switch(NumElts) {
1737 case 2: return MVT::v2i1;
1738 case 4: return MVT::v4i1;
1739 case 8: return MVT::v8i1;
1740 }
1741 }
1742
1743 return VT.changeVectorElementTypeToInteger();
1744 }
1745
1746 /// Helper for getByValTypeAlignment to determine
1747 /// the desired ByVal argument alignment.
1748 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1749 if (MaxAlign == 16)
1750 return;
1751 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1752 if (VTy->getBitWidth() == 128)
1753 MaxAlign = 16;
1754 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1755 unsigned EltAlign = 0;
1756 getMaxByValAlign(ATy->getElementType(), EltAlign);
1757 if (EltAlign > MaxAlign)
1758 MaxAlign = EltAlign;
1759 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1760 for (auto *EltTy : STy->elements()) {
1761 unsigned EltAlign = 0;
1762 getMaxByValAlign(EltTy, EltAlign);
1763 if (EltAlign > MaxAlign)
1764 MaxAlign = EltAlign;
1765 if (MaxAlign == 16)
1766 break;
1767 }
1768 }
1769 }
1770
1771 /// Return the desired alignment for ByVal aggregate
1772 /// function arguments in the caller parameter area. For X86, aggregates
1773 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1774 /// are at 4-byte boundaries.
1775 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1776 const DataLayout &DL) const {
1777 if (Subtarget.is64Bit()) {
1778 // Max of 8 and alignment of type.
1779 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1780 if (TyAlign > 8)
1781 return TyAlign;
1782 return 8;
1783 }
1784
1785 unsigned Align = 4;
1786 if (Subtarget.hasSSE1())
1787 getMaxByValAlign(Ty, Align);
1788 return Align;
1789 }
1790
1791 /// Returns the target-specific optimal type for load
1792 /// and store operations as a result of memset, memcpy, and memmove
1793 /// lowering. If DstAlign is zero, the destination's
1794 /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero it
1795 /// means there isn't a need to check it against the alignment requirement,
1796 /// probably because the source does not need to be loaded. If 'IsMemset' is
1797 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1798 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1799 /// source is constant so it does not need to be loaded.
1800 /// It returns EVT::Other if the type should be determined using generic
1801 /// target-independent logic.
1802 EVT
1803 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1804 unsigned DstAlign, unsigned SrcAlign,
1805 bool IsMemset, bool ZeroMemset,
1806 bool MemcpyStrSrc,
1807 MachineFunction &MF) const {
1808 const Function *F = MF.getFunction();
1809 if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1810 if (Size >= 16 &&
1811 (!Subtarget.isUnalignedMem16Slow() ||
1812 ((DstAlign == 0 || DstAlign >= 16) &&
1813 (SrcAlign == 0 || SrcAlign >= 16)))) {
1814 // FIXME: Check if unaligned 32-byte accesses are slow.
1815 if (Size >= 32 && Subtarget.hasAVX()) {
1816 // Although this isn't a well-supported type for AVX1, we'll let
1817 // legalization and shuffle lowering produce the optimal codegen. If we
1818 // choose an optimal type with a vector element larger than a byte,
1819 // getMemsetStores() may create an intermediate splat (using an integer
1820 // multiply) before we splat as a vector.
1821 return MVT::v32i8;
1822 }
1823 if (Subtarget.hasSSE2())
1824 return MVT::v16i8;
1825 // TODO: Can SSE1 handle a byte vector?
1826 if (Subtarget.hasSSE1())
1827 return MVT::v4f32;
1828 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1829 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1830 // Do not use f64 to lower memcpy if source is string constant. It's
1831 // better to use i32 to avoid the loads.
1832 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1833 // The gymnastics of splatting a byte value into an XMM register and then
1834 // only using 8-byte stores (because this is a CPU with slow unaligned
1835 // 16-byte accesses) makes that a loser.
1836 return MVT::f64;
1837 }
1838 }
1839 // This is a compromise. If we reach here, unaligned accesses may be slow on
1840 // this target. However, creating smaller, aligned accesses could be even
1841 // slower and would certainly be a lot more code.
1842 if (Subtarget.is64Bit() && Size >= 8)
1843 return MVT::i64;
1844 return MVT::i32;
1845 }
1846
1847 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1848 if (VT == MVT::f32)
1849 return X86ScalarSSEf32;
1850 else if (VT == MVT::f64)
1851 return X86ScalarSSEf64;
1852 return true;
1853 }
1854
1855 bool
1856 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1857 unsigned,
1858 unsigned,
1859 bool *Fast) const {
1860 if (Fast) {
1861 switch (VT.getSizeInBits()) {
1862 default:
1863 // 8-byte and under are always assumed to be fast.
1864 *Fast = true;
1865 break;
1866 case 128:
1867 *Fast = !Subtarget.isUnalignedMem16Slow();
1868 break;
1869 case 256:
1870 *Fast = !Subtarget.isUnalignedMem32Slow();
1871 break;
1872 // TODO: What about AVX-512 (512-bit) accesses?
1873 }
1874 }
1875 // Misaligned accesses of any size are always allowed.
1876 return true;
1877 }
1878
1879 /// Return the entry encoding for a jump table in the
1880 /// current function. The returned value is a member of the
1881 /// MachineJumpTableInfo::JTEntryKind enum.
1882 unsigned X86TargetLowering::getJumpTableEncoding() const {
1883 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1884 // symbol.
1885 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1886 return MachineJumpTableInfo::EK_Custom32;
1887
1888 // Otherwise, use the normal jump table encoding heuristics.
1889 return TargetLowering::getJumpTableEncoding();
1890 }
1891
1892 bool X86TargetLowering::useSoftFloat() const {
1893 return Subtarget.useSoftFloat();
1894 }
1895
1896 const MCExpr *
1897 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1898 const MachineBasicBlock *MBB,
1899 unsigned uid,MCContext &Ctx) const{
1900 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1901 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1902 // entries.
1903 return MCSymbolRefExpr::create(MBB->getSymbol(),
1904 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1905 }
1906
1907 /// Returns relocation base for the given PIC jumptable.
1908 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1909 SelectionDAG &DAG) const {
1910 if (!Subtarget.is64Bit())
1911 // This doesn't have SDLoc associated with it, but is not really the
1912 // same as a Register.
1913 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1914 getPointerTy(DAG.getDataLayout()));
1915 return Table;
1916 }
1917
1918 /// This returns the relocation base for the given PIC jumptable,
1919 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1920 const MCExpr *X86TargetLowering::
1921 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1922 MCContext &Ctx) const {
1923 // X86-64 uses RIP relative addressing based on the jump table label.
1924 if (Subtarget.isPICStyleRIPRel())
1925 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1926
1927 // Otherwise, the reference is relative to the PIC base.
1928 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1929 }
1930
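// The representative register class and its cost feed the scheduler's
// register-pressure heuristics.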
1931 std::pair<const TargetRegisterClass *, uint8_t>
1932 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1933 MVT VT) const {
1934 const TargetRegisterClass *RRC = nullptr;
1935 uint8_t Cost = 1;
1936 switch (VT.SimpleTy) {
1937 default:
1938 return TargetLowering::findRepresentativeClass(TRI, VT);
1939 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1940 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1941 break;
1942 case MVT::x86mmx:
1943 RRC = &X86::VR64RegClass;
1944 break;
1945 case MVT::f32: case MVT::f64:
1946 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1947 case MVT::v4f32: case MVT::v2f64:
1948 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1949 case MVT::v4f64:
1950 RRC = &X86::VR128RegClass;
1951 break;
1952 }
1953 return std::make_pair(RRC, Cost);
1954 }
1955
1956 unsigned X86TargetLowering::getAddressSpace() const {
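  // Address space 256 selects %gs and 257 selects %fs. The TLS slots used
  // below live in %fs on 64-bit targets (or %gs under the Kernel code model)
  // and in %gs on 32-bit targets.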
1957 if (Subtarget.is64Bit())
1958 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
1959 return 256;
1960 }
1961
1962 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
1963 // glibc has a special slot for the stack guard in tcbhead_t, use it instead
1964 // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
1965 if (!Subtarget.isTargetGlibc())
1966 return TargetLowering::getIRStackGuard(IRB);
1967
1968   // The guard is at %fs:0x28 on x86-64 (with the Kernel code model the base
1969   // segment is %gs instead), and at %gs:0x14 on i386.
1970 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
1971 unsigned AddressSpace = getAddressSpace();
1972 return ConstantExpr::getIntToPtr(
1973 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
1974 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
1975 }
1976
1977 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
1978   // The MSVC CRT provides functionality for stack protection.
1979 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
1980 // MSVC CRT has a global variable holding security cookie.
1981 M.getOrInsertGlobal("__security_cookie",
1982 Type::getInt8PtrTy(M.getContext()));
1983
1984 // MSVC CRT has a function to validate security cookie.
1985 auto *SecurityCheckCookie = cast<Function>(
1986 M.getOrInsertFunction("__security_check_cookie",
1987 Type::getVoidTy(M.getContext()),
1988 Type::getInt8PtrTy(M.getContext()), nullptr));
1989 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
1990 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
1991 return;
1992 }
1993 // glibc has a special slot for the stack guard.
1994 if (Subtarget.isTargetGlibc())
1995 return;
1996 TargetLowering::insertSSPDeclarations(M);
1997 }
1998
1999 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2000 // MSVC CRT has a global variable holding security cookie.
2001 if (Subtarget.getTargetTriple().isOSMSVCRT())
2002 return M.getGlobalVariable("__security_cookie");
2003 return TargetLowering::getSDagStackGuard(M);
2004 }
2005
2006 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2007 // MSVC CRT has a function to validate security cookie.
2008 if (Subtarget.getTargetTriple().isOSMSVCRT())
2009 return M.getFunction("__security_check_cookie");
2010 return TargetLowering::getSSPStackGuardCheck(M);
2011 }
2012
2013 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2014 if (!Subtarget.isTargetAndroid())
2015 return TargetLowering::getSafeStackPointerLocation(IRB);
2016
2017 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2018 // definition of TLS_SLOT_SAFESTACK in
2019 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2020 unsigned AddressSpace, Offset;
2021
2022   // The slot is at %fs:0x48 on x86-64 (with the Kernel code model the base
2023   // segment is %gs instead), and at %gs:0x24 on i386.
2024 Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2025 AddressSpace = getAddressSpace();
2026 return ConstantExpr::getIntToPtr(
2027 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2028 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2029 }
2030
2031 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2032 unsigned DestAS) const {
2033 assert(SrcAS != DestAS && "Expected different address spaces!");
2034
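  // Address spaces below 256 share the flat address space; 256 and above map
  // to the %gs/%fs/%ss segment overrides, so a cast to or from them changes
  // the addressing.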
2035 return SrcAS < 256 && DestAS < 256;
2036 }
2037
2038 //===----------------------------------------------------------------------===//
2039 // Return Value Calling Convention Implementation
2040 //===----------------------------------------------------------------------===//
2041
2042 #include "X86GenCallingConv.inc"
2043
2044 bool X86TargetLowering::CanLowerReturn(
2045 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2046 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2047 SmallVector<CCValAssign, 16> RVLocs;
2048 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2049 return CCInfo.CheckReturn(Outs, RetCC_X86);
2050 }
2051
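// R11 is caller-saved and never used for argument passing, so it can be
// handed out as a scratch register (e.g. for patchpoint lowering).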
2052 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2053 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2054 return ScratchRegs;
2055 }
2056
2057 SDValue
2058 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2059 bool isVarArg,
2060 const SmallVectorImpl<ISD::OutputArg> &Outs,
2061 const SmallVectorImpl<SDValue> &OutVals,
2062 const SDLoc &dl, SelectionDAG &DAG) const {
2063 MachineFunction &MF = DAG.getMachineFunction();
2064 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2065
2066 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2067 report_fatal_error("X86 interrupts may not return any value");
2068
2069 SmallVector<CCValAssign, 16> RVLocs;
2070 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2071 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2072
2073 SDValue Flag;
2074 SmallVector<SDValue, 6> RetOps;
2075 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2076 // Operand #1 = Bytes To Pop
2077 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2078 MVT::i32));
2079
2080 // Copy the result values into the output registers.
2081 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2082 CCValAssign &VA = RVLocs[i];
2083 assert(VA.isRegLoc() && "Can only return in registers!");
2084 SDValue ValToCopy = OutVals[i];
2085 EVT ValVT = ValToCopy.getValueType();
2086
2087 // Promote values to the appropriate types.
2088 if (VA.getLocInfo() == CCValAssign::SExt)
2089 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2090 else if (VA.getLocInfo() == CCValAssign::ZExt)
2091 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2092 else if (VA.getLocInfo() == CCValAssign::AExt) {
2093 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2094 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2095 else
2096 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2097 }
2098 else if (VA.getLocInfo() == CCValAssign::BCvt)
2099 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2100
2101 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2102 "Unexpected FP-extend for return value.");
2103
2104 // If this is x86-64, and we disabled SSE, we can't return FP values,
2105 // or SSE or MMX vectors.
2106 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2107 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2108 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2109 report_fatal_error("SSE register return with SSE disabled");
2110 }
2111 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2112 // llvm-gcc has never done it right and no one has noticed, so this
2113 // should be OK for now.
2114 if (ValVT == MVT::f64 &&
2115 (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
2116 report_fatal_error("SSE2 register return with SSE2 disabled");
2117
2118 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2119 // the RET instruction and handled by the FP Stackifier.
2120 if (VA.getLocReg() == X86::FP0 ||
2121 VA.getLocReg() == X86::FP1) {
2122 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2123 // change the value to the FP stack register class.
2124 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2125 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2126 RetOps.push_back(ValToCopy);
2127 // Don't emit a copytoreg.
2128 continue;
2129 }
2130
2131 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2132 // which is returned in RAX / RDX.
2133 if (Subtarget.is64Bit()) {
2134 if (ValVT == MVT::x86mmx) {
2135 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2136 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2137 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2138 ValToCopy);
2139 // If we don't have SSE2 available, convert to v4f32 so the generated
2140 // register is legal.
2141 if (!Subtarget.hasSSE2())
2142 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2143 }
2144 }
2145 }
2146
2147 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2148 Flag = Chain.getValue(1);
2149 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2150 }
2151
2152 // Swift calling convention does not require we copy the sret argument
2153 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2154
2155 // All x86 ABIs require that for returning structs by value we copy
2156 // the sret argument into %rax/%eax (depending on ABI) for the return.
2157 // We saved the argument into a virtual register in the entry block,
2158 // so now we copy the value out and into %rax/%eax.
2159 //
2160 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2161 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2162 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2163 // either case FuncInfo->setSRetReturnReg() will have been called.
2164 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2165 // When we have both sret and another return value, we should use the
2166 // original Chain stored in RetOps[0], instead of the current Chain updated
2167 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2168
2169 // For the case of sret and another return value, we have
2170 // Chain_0 at the function entry
2171 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2172 // If we use Chain_1 in getCopyFromReg, we will have
2173 // Val = getCopyFromReg(Chain_1)
2174 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2175
2176 // getCopyToReg(Chain_0) will be glued together with
2177 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2178 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2179 // Data dependency from Unit B to Unit A due to usage of Val in
2180 // getCopyToReg(Chain_1, Val)
2181 // Chain dependency from Unit A to Unit B
2182
2183 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2184 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2185 getPointerTy(MF.getDataLayout()));
2186
2187 unsigned RetValReg
2188 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2189 X86::RAX : X86::EAX;
2190 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2191 Flag = Chain.getValue(1);
2192
2193 // RAX/EAX now acts like a return value.
2194 RetOps.push_back(
2195 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2196 }
2197
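  // Some calling conventions (e.g. CXX_FAST_TLS) return callee-saved registers
  // via explicit copies; add them as return operands so they are modeled as
  // live-out.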
2198 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2199 const MCPhysReg *I =
2200 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2201 if (I) {
2202 for (; *I; ++I) {
2203 if (X86::GR64RegClass.contains(*I))
2204 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2205 else
2206 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2207 }
2208 }
2209
2210 RetOps[0] = Chain; // Update chain.
2211
2212 // Add the flag if we have it.
2213 if (Flag.getNode())
2214 RetOps.push_back(Flag);
2215
2216 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2217 if (CallConv == CallingConv::X86_INTR)
2218 opcode = X86ISD::IRET;
2219 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2220 }
2221
2222 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2223 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2224 return false;
2225
2226 SDValue TCChain = Chain;
2227 SDNode *Copy = *N->use_begin();
2228 if (Copy->getOpcode() == ISD::CopyToReg) {
2229 // If the copy has a glue operand, we conservatively assume it isn't safe to
2230 // perform a tail call.
2231 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2232 return false;
2233 TCChain = Copy->getOperand(0);
2234 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2235 return false;
2236
2237 bool HasRet = false;
2238 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2239 UI != UE; ++UI) {
2240 if (UI->getOpcode() != X86ISD::RET_FLAG)
2241 return false;
2242     // If we are returning more than one value, we can definitely
2243     // not make a tail call (see PR19530).
2244 if (UI->getNumOperands() > 4)
2245 return false;
2246 if (UI->getNumOperands() == 4 &&
2247 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2248 return false;
2249 HasRet = true;
2250 }
2251
2252 if (!HasRet)
2253 return false;
2254
2255 Chain = TCChain;
2256 return true;
2257 }
2258
2259 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2260 ISD::NodeType ExtendKind) const {
2261 MVT ReturnMVT = MVT::i32;
2262
2263 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2264 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2265 // The ABI does not require i1, i8 or i16 to be extended.
2266 //
2267 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2268 // always extending i8/i16 return values, so keep doing that for now.
2269 // (PR26665).
2270 ReturnMVT = MVT::i8;
2271 }
2272
2273 EVT MinVT = getRegisterType(Context, ReturnMVT);
2274 return VT.bitsLT(MinVT) ? MinVT : VT;
2275 }
2276
2277 /// Lower the result values of a call into the
2278 /// appropriate copies out of appropriate physical registers.
2279 ///
2280 SDValue X86TargetLowering::LowerCallResult(
2281 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2282 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2283 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2284
2285 // Assign locations to each value returned by this call.
2286 SmallVector<CCValAssign, 16> RVLocs;
2287 bool Is64Bit = Subtarget.is64Bit();
2288 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2289 *DAG.getContext());
2290 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2291
2292 // Copy all of the result registers out of their specified physreg.
2293 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2294 CCValAssign &VA = RVLocs[i];
2295 EVT CopyVT = VA.getLocVT();
2296
2297 // If this is x86-64, and we disabled SSE, we can't return FP values
2298 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2299 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2300 report_fatal_error("SSE register return with SSE disabled");
2301 }
2302
2303 // If we prefer to use the value in xmm registers, copy it out as f80 and
2304 // use a truncate to move it from fp stack reg to xmm reg.
2305 bool RoundAfterCopy = false;
2306 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2307 isScalarFPTypeInSSEReg(VA.getValVT())) {
2308 if (!Subtarget.hasX87())
2309 report_fatal_error("X87 register return with X87 disabled");
2310 CopyVT = MVT::f80;
2311 RoundAfterCopy = (CopyVT != VA.getLocVT());
2312 }
2313
2314 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2315 CopyVT, InFlag).getValue(1);
2316 SDValue Val = Chain.getValue(0);
2317
2318 if (RoundAfterCopy)
2319 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2320 // This truncation won't change the value.
2321 DAG.getIntPtrConstant(1, dl));
2322
2323 if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1)
2324 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2325
2326 InFlag = Chain.getValue(2);
2327 InVals.push_back(Val);
2328 }
2329
2330 return Chain;
2331 }
2332
2333 //===----------------------------------------------------------------------===//
2334 // C & StdCall & Fast Calling Convention implementation
2335 //===----------------------------------------------------------------------===//
2336 // The StdCall calling convention is standard for many Windows API
2337 // routines. It differs from the C calling convention only slightly: the
2338 // callee cleans up the stack rather than the caller, and symbols are
2339 // decorated in a particular way. It doesn't support any vector arguments.
2340 // For info on the fast calling convention see the Fast Calling Convention
2341 // (tail call) implementation, LowerX86_32FastCCCallTo.
2342
2343 /// CallIsStructReturn - Determines whether a call uses struct return
2344 /// semantics.
2345 enum StructReturnType {
2346 NotStructReturn,
2347 RegStructReturn,
2348 StackStructReturn
2349 };
2350 static StructReturnType
2351 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2352 if (Outs.empty())
2353 return NotStructReturn;
2354
2355 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2356 if (!Flags.isSRet())
2357 return NotStructReturn;
2358 if (Flags.isInReg() || IsMCU)
2359 return RegStructReturn;
2360 return StackStructReturn;
2361 }
2362
2363 /// Determines whether a function uses struct return semantics.
2364 static StructReturnType
2365 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2366 if (Ins.empty())
2367 return NotStructReturn;
2368
2369 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2370 if (!Flags.isSRet())
2371 return NotStructReturn;
2372 if (Flags.isInReg() || IsMCU)
2373 return RegStructReturn;
2374 return StackStructReturn;
2375 }
2376
2377 /// Make a copy of an aggregate at address specified by "Src" to address
2378 /// "Dst" with size and alignment information specified by the specific
2379 /// parameter attribute. The copy will be passed as a byval function parameter.
2380 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2381 SDValue Chain, ISD::ArgFlagsTy Flags,
2382 SelectionDAG &DAG, const SDLoc &dl) {
2383 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2384
2385 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2386 /*isVolatile*/false, /*AlwaysInline=*/true,
2387 /*isTailCall*/false,
2388 MachinePointerInfo(), MachinePointerInfo());
2389 }
2390
2391 /// Return true if the calling convention is one that we can guarantee TCO for.
2392 static bool canGuaranteeTCO(CallingConv::ID CC) {
2393 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2394 CC == CallingConv::HiPE || CC == CallingConv::HHVM);
2395 }
2396
2397 /// Return true if we might ever do TCO for calls with this calling convention.
2398 static bool mayTailCallThisCC(CallingConv::ID CC) {
2399 switch (CC) {
2400 // C calling conventions:
2401 case CallingConv::C:
2402 case CallingConv::X86_64_Win64:
2403 case CallingConv::X86_64_SysV:
2404 // Callee pop conventions:
2405 case CallingConv::X86_ThisCall:
2406 case CallingConv::X86_StdCall:
2407 case CallingConv::X86_VectorCall:
2408 case CallingConv::X86_FastCall:
2409 return true;
2410 default:
2411 return canGuaranteeTCO(CC);
2412 }
2413 }
2414
2415 /// Return true if the function is being made into a tailcall target by
2416 /// changing its ABI.
2417 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2418 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2419 }
2420
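// Called by CodeGenPrepare when deciding whether duplicating a return block
// would enable a tail call; the final tail-call decision is made during call
// lowering.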
2421 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2422 auto Attr =
2423 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2424 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2425 return false;
2426
2427 CallSite CS(CI);
2428 CallingConv::ID CalleeCC = CS.getCallingConv();
2429 if (!mayTailCallThisCC(CalleeCC))
2430 return false;
2431
2432 return true;
2433 }
2434
2435 SDValue
2436 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2437 const SmallVectorImpl<ISD::InputArg> &Ins,
2438 const SDLoc &dl, SelectionDAG &DAG,
2439 const CCValAssign &VA,
2440 MachineFrameInfo *MFI, unsigned i) const {
2441 // Create the nodes corresponding to a load from this parameter slot.
2442 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2443 bool AlwaysUseMutable = shouldGuaranteeTCO(
2444 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2445 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2446 EVT ValVT;
2447
2448   // If the value is passed by pointer, we have the address passed instead of
2449   // the value itself.
2450 bool ExtendedInMem = VA.isExtInLoc() &&
2451 VA.getValVT().getScalarType() == MVT::i1;
2452
2453 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2454 ValVT = VA.getLocVT();
2455 else
2456 ValVT = VA.getValVT();
2457
2458   // Calculate the SP offset of an interrupt parameter, re-arranging the slot
2459   // normally taken by a return address.
2460 int Offset = 0;
2461 if (CallConv == CallingConv::X86_INTR) {
2462 const X86Subtarget& Subtarget =
2463 static_cast<const X86Subtarget&>(DAG.getSubtarget());
2464 // X86 interrupts may take one or two arguments.
2465     // Unlike a regular call, there is no return address on the stack.
2466     // The offset of the last argument needs to be set to -4/-8 bytes,
2467     // while the offset of the first of two arguments should be set to 0 bytes.
2468 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2469 }
2470
2471 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2472 // changed with more analysis.
2473   // In case of tail call optimization, mark all arguments mutable, since they
2474   // could be overwritten by the lowering of arguments in a tail call.
2475 if (Flags.isByVal()) {
2476 unsigned Bytes = Flags.getByValSize();
2477 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2478 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2479 // Adjust SP offset of interrupt parameter.
2480 if (CallConv == CallingConv::X86_INTR) {
2481 MFI->setObjectOffset(FI, Offset);
2482 }
2483 return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2484 } else {
2485 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2486 VA.getLocMemOffset(), isImmutable);
2487
2488 // Set SExt or ZExt flag.
2489 if (VA.getLocInfo() == CCValAssign::ZExt) {
2490 MFI->setObjectZExt(FI, true);
2491 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2492 MFI->setObjectSExt(FI, true);
2493 }
2494
2495 // Adjust SP offset of interrupt parameter.
2496 if (CallConv == CallingConv::X86_INTR) {
2497 MFI->setObjectOffset(FI, Offset);
2498 }
2499
2500 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2501 SDValue Val = DAG.getLoad(
2502 ValVT, dl, Chain, FIN,
2503 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
2504 false, false, 0);
2505 return ExtendedInMem ?
2506 DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
2507 }
2508 }
2509
2510 // FIXME: Get this from tablegen.
2511 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2512 const X86Subtarget &Subtarget) {
2513 assert(Subtarget.is64Bit());
2514
2515 if (Subtarget.isCallingConvWin64(CallConv)) {
2516 static const MCPhysReg GPR64ArgRegsWin64[] = {
2517 X86::RCX, X86::RDX, X86::R8, X86::R9
2518 };
2519 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2520 }
2521
2522 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2523 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2524 };
2525 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2526 }
2527
2528 // FIXME: Get this from tablegen.
2529 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2530 CallingConv::ID CallConv,
2531 const X86Subtarget &Subtarget) {
2532 assert(Subtarget.is64Bit());
2533 if (Subtarget.isCallingConvWin64(CallConv)) {
2534     // The XMM registers which might contain vararg parameters are shadowed
2535     // by their paired GPRs, so we only need to save the GPRs to their home
2536     // slots.
2537 // TODO: __vectorcall will change this.
2538 return None;
2539 }
2540
2541 const Function *Fn = MF.getFunction();
2542 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2543 bool isSoftFloat = Subtarget.useSoftFloat();
2544 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2545 "SSE register cannot be used when SSE is disabled!");
2546 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2547 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2548 // registers.
2549 return None;
2550
2551 static const MCPhysReg XMMArgRegs64Bit[] = {
2552 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2553 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2554 };
2555 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2556 }
2557
2558 SDValue X86TargetLowering::LowerFormalArguments(
2559 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2560 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2561 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2562 MachineFunction &MF = DAG.getMachineFunction();
2563 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2564 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2565
2566 const Function *Fn = MF.getFunction();
2567 if (Fn->hasExternalLinkage() &&
2568 Subtarget.isTargetCygMing() &&
2569 Fn->getName() == "main")
2570 FuncInfo->setForceFramePointer(true);
2571
2572 MachineFrameInfo *MFI = MF.getFrameInfo();
2573 bool Is64Bit = Subtarget.is64Bit();
2574 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2575
2576 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
2577 "Var args not supported with calling convention fastcc, ghc or hipe");
2578
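// X86 interrupt handlers take a pointer to the interrupt frame as their first
// argument and, for exceptions that push one, the CPU-supplied error code as a
// second argument (i64 in 64-bit mode, i32 otherwise).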
2579 if (CallConv == CallingConv::X86_INTR) {
2580 bool isLegal = Ins.size() == 1 ||
2581 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2582 (!Is64Bit && Ins[1].VT == MVT::i32)));
2583 if (!isLegal)
2584 report_fatal_error("X86 interrupts may take one or two arguments");
2585 }
2586
2587 // Assign locations to all of the incoming arguments.
2588 SmallVector<CCValAssign, 16> ArgLocs;
2589 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2590
2591 // Allocate shadow area for Win64
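// (32 bytes: one 8-byte home slot for each of RCX, RDX, R8 and R9.)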
2592 if (IsWin64)
2593 CCInfo.AllocateStack(32, 8);
2594
2595 CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2596
2597 unsigned LastVal = ~0U;
2598 SDValue ArgValue;
2599 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2600 CCValAssign &VA = ArgLocs[i];
2601 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2602 // places.
2603 assert(VA.getValNo() != LastVal &&
2604 "Don't support value assigned to multiple locs yet");
2605 (void)LastVal;
2606 LastVal = VA.getValNo();
2607
2608 if (VA.isRegLoc()) {
2609 EVT RegVT = VA.getLocVT();
2610 const TargetRegisterClass *RC;
2611 if (RegVT == MVT::i32)
2612 RC = &X86::GR32RegClass;
2613 else if (Is64Bit && RegVT == MVT::i64)
2614 RC = &X86::GR64RegClass;
2615 else if (RegVT == MVT::f32)
2616 RC = &X86::FR32RegClass;
2617 else if (RegVT == MVT::f64)
2618 RC = &X86::FR64RegClass;
2619 else if (RegVT == MVT::f128)
2620 RC = &X86::FR128RegClass;
2621 else if (RegVT.is512BitVector())
2622 RC = &X86::VR512RegClass;
2623 else if (RegVT.is256BitVector())
2624 RC = &X86::VR256RegClass;
2625 else if (RegVT.is128BitVector())
2626 RC = &X86::VR128RegClass;
2627 else if (RegVT == MVT::x86mmx)
2628 RC = &X86::VR64RegClass;
2629 else if (RegVT == MVT::i1)
2630 RC = &X86::VK1RegClass;
2631 else if (RegVT == MVT::v8i1)
2632 RC = &X86::VK8RegClass;
2633 else if (RegVT == MVT::v16i1)
2634 RC = &X86::VK16RegClass;
2635 else if (RegVT == MVT::v32i1)
2636 RC = &X86::VK32RegClass;
2637 else if (RegVT == MVT::v64i1)
2638 RC = &X86::VK64RegClass;
2639 else
2640 llvm_unreachable("Unknown argument type!");
2641
2642 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2643 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2644
2645 // If this is an 8 or 16-bit value, it is really passed promoted to 32
2646 // bits. Insert an assert[sz]ext to capture this, then truncate to the
2647 // right size.
2648 if (VA.getLocInfo() == CCValAssign::SExt)
2649 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2650 DAG.getValueType(VA.getValVT()));
2651 else if (VA.getLocInfo() == CCValAssign::ZExt)
2652 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2653 DAG.getValueType(VA.getValVT()));
2654 else if (VA.getLocInfo() == CCValAssign::BCvt)
2655 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
2656
2657 if (VA.isExtInLoc()) {
2658 // Handle MMX values passed in XMM regs.
2659 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
2660 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2661 else
2662 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2663 }
2664 } else {
2665 assert(VA.isMemLoc());
2666 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2667 }
2668
2669 // If value is passed via pointer - do a load.
2670 if (VA.getLocInfo() == CCValAssign::Indirect)
2671 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2672 MachinePointerInfo(), false, false, false, 0);
2673
2674 InVals.push_back(ArgValue);
2675 }
2676
2677 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2678 // Swift calling convention does not require us to copy the sret argument
2679 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
2680 if (CallConv == CallingConv::Swift)
2681 continue;
2682
2683 // All x86 ABIs require that for returning structs by value we copy the
2684 // sret argument into %rax/%eax (depending on ABI) for the return. Save
2685 // the argument into a virtual register so that we can access it from the
2686 // return points.
2687 if (Ins[i].Flags.isSRet()) {
2688 unsigned Reg = FuncInfo->getSRetReturnReg();
2689 if (!Reg) {
2690 MVT PtrTy = getPointerTy(DAG.getDataLayout());
2691 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2692 FuncInfo->setSRetReturnReg(Reg);
2693 }
2694 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2695 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2696 break;
2697 }
2698 }
2699
2700 unsigned StackSize = CCInfo.getNextStackOffset();
2701 // Align stack specially for tail calls.
2702 if (shouldGuaranteeTCO(CallConv,
2703 MF.getTarget().Options.GuaranteedTailCallOpt))
2704 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2705
2706 // If the function takes a variable number of arguments, make a frame index for
2707 // the start of the first vararg value... for expansion of llvm.va_start. We
2708 // can skip this if there are no va_start calls.
2709 if (MFI->hasVAStart() &&
2710 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2711 CallConv != CallingConv::X86_ThisCall))) {
2712 FuncInfo->setVarArgsFrameIndex(
2713 MFI->CreateFixedObject(1, StackSize, true));
2714 }
2715
2716 // Figure out if XMM registers are in use.
2717 assert(!(Subtarget.useSoftFloat() &&
2718 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
2719 "SSE register cannot be used when SSE is disabled!");
2720
2721 // 64-bit calling conventions support varargs and register parameters, so we
2722 // have to do extra work to spill them in the prologue.
2723 if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2724 // Find the first unallocated argument register of each class.
2725 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2726 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2727 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
2728 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
2729 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
2730 "SSE register cannot be used when SSE is disabled!");
2731
2732 // Gather all the live in physical registers.
2733 SmallVector<SDValue, 6> LiveGPRs;
2734 SmallVector<SDValue, 8> LiveXMMRegs;
2735 SDValue ALVal;
2736 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2737 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2738 LiveGPRs.push_back(
2739 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2740 }
2741 if (!ArgXMMs.empty()) {
2742 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2743 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2744 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2745 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2746 LiveXMMRegs.push_back(
2747 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2748 }
2749 }
2750
2751 if (IsWin64) {
2752 // Get to the caller-allocated home save location. Add 8 to account
2753 // for the return address.
2754 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2755 FuncInfo->setRegSaveFrameIndex(
2756 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2757 // Fixup to set vararg frame on shadow area (4 x i64).
2758 if (NumIntRegs < 4)
2759 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2760 } else {
2761 // For X86-64, if there are vararg parameters that are passed via
2762 // registers, then we must store them to their spots on the stack so
2763 // they may be loaded by dereferencing the result of va_next.
2764 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2765 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2766 FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2767 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
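// With the full SysV argument register set this is the usual 176-byte register
// save area: 6 GPRs * 8 bytes + 8 XMMs * 16 bytes.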
2768 }
2769
2770 // Store the integer parameter registers.
2771 SmallVector<SDValue, 8> MemOps;
2772 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2773 getPointerTy(DAG.getDataLayout()));
2774 unsigned Offset = FuncInfo->getVarArgsGPOffset();
2775 for (SDValue Val : LiveGPRs) {
2776 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2777 RSFIN, DAG.getIntPtrConstant(Offset, dl));
2778 SDValue Store =
2779 DAG.getStore(Val.getValue(1), dl, Val, FIN,
2780 MachinePointerInfo::getFixedStack(
2781 DAG.getMachineFunction(),
2782 FuncInfo->getRegSaveFrameIndex(), Offset),
2783 false, false, 0);
2784 MemOps.push_back(Store);
2785 Offset += 8;
2786 }
2787
2788 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2789 // Now store the XMM (fp + vector) parameter registers.
2790 SmallVector<SDValue, 12> SaveXMMOps;
2791 SaveXMMOps.push_back(Chain);
2792 SaveXMMOps.push_back(ALVal);
2793 SaveXMMOps.push_back(DAG.getIntPtrConstant(
2794 FuncInfo->getRegSaveFrameIndex(), dl));
2795 SaveXMMOps.push_back(DAG.getIntPtrConstant(
2796 FuncInfo->getVarArgsFPOffset(), dl));
2797 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2798 LiveXMMRegs.end());
2799 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2800 MVT::Other, SaveXMMOps));
2801 }
2802
2803 if (!MemOps.empty())
2804 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2805 }
2806
2807 if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2808 // Find the largest legal vector type.
2809 MVT VecVT = MVT::Other;
2810 // FIXME: Only some x86_32 calling conventions support AVX512.
2811 if (Subtarget.hasAVX512() &&
2812 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2813 CallConv == CallingConv::Intel_OCL_BI)))
2814 VecVT = MVT::v16f32;
2815 else if (Subtarget.hasAVX())
2816 VecVT = MVT::v8f32;
2817 else if (Subtarget.hasSSE2())
2818 VecVT = MVT::v4f32;
2819
2820 // We forward some GPRs and some vector types.
2821 SmallVector<MVT, 2> RegParmTypes;
2822 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2823 RegParmTypes.push_back(IntVT);
2824 if (VecVT != MVT::Other)
2825 RegParmTypes.push_back(VecVT);
2826
2827 // Compute the set of forwarded registers. The rest are scratch.
2828 SmallVectorImpl<ForwardedRegister> &Forwards =
2829 FuncInfo->getForwardedMustTailRegParms();
2830 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2831
2832 // Conservatively forward AL on x86_64, since it might be used for varargs.
2833 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2834 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2835 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2836 }
2837
2838 // Copy all forwards from physical to virtual registers.
2839 for (ForwardedRegister &F : Forwards) {
2840 // FIXME: Can we use a less constrained schedule?
2841 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2842 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2843 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2844 }
2845 }
2846
2847 // Some CCs need callee pop.
2848 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2849 MF.getTarget().Options.GuaranteedTailCallOpt)) {
2850 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2851 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
2852 // X86 interrupts must pop the error code if present
2853 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
2854 } else {
2855 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2856 // If this is an sret function, the return should pop the hidden pointer.
2857 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
2858 !Subtarget.getTargetTriple().isOSMSVCRT() &&
2859 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
2860 FuncInfo->setBytesToPopOnReturn(4);
2861 }
2862
2863 if (!Is64Bit) {
2864 // RegSaveFrameIndex is X86-64 only.
2865 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2866 if (CallConv == CallingConv::X86_FastCall ||
2867 CallConv == CallingConv::X86_ThisCall)
2868 // fastcc functions can't have varargs.
2869 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2870 }
2871
2872 FuncInfo->setArgumentStackSize(StackSize);
2873
2874 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
2875 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
2876 if (Personality == EHPersonality::CoreCLR) {
2877 assert(Is64Bit);
2878 // TODO: Add a mechanism to frame lowering that will allow us to indicate
2879 // that we'd prefer this slot be allocated towards the bottom of the frame
2880 // (i.e. near the stack pointer after allocating the frame). Every
2881 // funclet needs a copy of this slot in its (mostly empty) frame, and the
2882 // offset from the bottom of this and each funclet's frame must be the
2883 // same, so the size of funclets' (mostly empty) frames is dictated by
2884 // how far this slot is from the bottom (since they allocate just enough
2885 // space to accommodate holding this slot at the correct offset).
2886 int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
2887 EHInfo->PSPSymFrameIdx = PSPSymFI;
2888 }
2889 }
2890
2891 return Chain;
2892 }
2893
2894 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
2895 SDValue Arg, const SDLoc &dl,
2896 SelectionDAG &DAG,
2897 const CCValAssign &VA,
2898 ISD::ArgFlagsTy Flags) const {
2899 unsigned LocMemOffset = VA.getLocMemOffset();
2900 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
2901 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2902 StackPtr, PtrOff);
2903 if (Flags.isByVal())
2904 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2905
2906 return DAG.getStore(
2907 Chain, dl, Arg, PtrOff,
2908 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
2909 false, false, 0);
2910 }
2911
2912 /// Emit a load of return address if tail call
2913 /// optimization is performed and it is required.
2914 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
2915 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
2916 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
2917 // Adjust the Return address stack slot.
2918 EVT VT = getPointerTy(DAG.getDataLayout());
2919 OutRetAddr = getReturnAddressFrameIndex(DAG);
2920
2921 // Load the "old" Return address.
2922 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2923 false, false, false, 0);
2924 return SDValue(OutRetAddr.getNode(), 1);
2925 }
2926
2927 /// Emit a store of the return address if tail call
2928 /// optimization is performed and it is required (FPDiff!=0).
2929 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2930 SDValue Chain, SDValue RetAddrFrIdx,
2931 EVT PtrVT, unsigned SlotSize,
2932 int FPDiff, const SDLoc &dl) {
2933 // Store the return address to the appropriate stack slot.
2934 if (!FPDiff) return Chain;
2935 // Calculate the new stack slot for the return address.
2936 int NewReturnAddrFI =
2937 MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2938 false);
2939 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2940 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2941 MachinePointerInfo::getFixedStack(
2942 DAG.getMachineFunction(), NewReturnAddrFI),
2943 false, false, 0);
2944 return Chain;
2945 }
2946
2947 /// Returns a vector_shuffle mask for a movs{s|d} or movd
2948 /// operation of the specified width.
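/// For example, for v4f32 this builds the mask <4,1,2,3>: the low element is
/// taken from V2 and the remaining elements from V1, matching the movss pattern.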
2949 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
2950 SDValue V2) {
2951 unsigned NumElems = VT.getVectorNumElements();
2952 SmallVector<int, 8> Mask;
2953 Mask.push_back(NumElems);
2954 for (unsigned i = 1; i != NumElems; ++i)
2955 Mask.push_back(i);
2956 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
2957 }
2958
2959 SDValue
2960 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2961 SmallVectorImpl<SDValue> &InVals) const {
2962 SelectionDAG &DAG = CLI.DAG;
2963 SDLoc &dl = CLI.DL;
2964 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2965 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2966 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2967 SDValue Chain = CLI.Chain;
2968 SDValue Callee = CLI.Callee;
2969 CallingConv::ID CallConv = CLI.CallConv;
2970 bool &isTailCall = CLI.IsTailCall;
2971 bool isVarArg = CLI.IsVarArg;
2972
2973 MachineFunction &MF = DAG.getMachineFunction();
2974 bool Is64Bit = Subtarget.is64Bit();
2975 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2976 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
2977 bool IsSibcall = false;
2978 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2979 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
2980
2981 if (CallConv == CallingConv::X86_INTR)
2982 report_fatal_error("X86 interrupts may not be called directly");
2983
2984 if (Attr.getValueAsString() == "true")
2985 isTailCall = false;
2986
2987 if (Subtarget.isPICStyleGOT() &&
2988 !MF.getTarget().Options.GuaranteedTailCallOpt) {
2989 // If we are using a GOT, disable tail calls to external symbols with
2990 // default visibility. Tail calling such a symbol requires using a GOT
2991 // relocation, which forces early binding of the symbol. This breaks code
2992 // that require lazy function symbol resolution. Using musttail or
2993 // GuaranteedTailCallOpt will override this.
2994 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2995 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
2996 G->getGlobal()->hasDefaultVisibility()))
2997 isTailCall = false;
2998 }
2999
3000 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3001 if (IsMustTail) {
3002 // Force this to be a tail call. The verifier rules are enough to ensure
3003 // that we can lower this successfully without moving the return address
3004 // around.
3005 isTailCall = true;
3006 } else if (isTailCall) {
3007 // Check if it's really possible to do a tail call.
3008 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3009 isVarArg, SR != NotStructReturn,
3010 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3011 Outs, OutVals, Ins, DAG);
3012
3013 // Sibcalls are automatically detected tailcalls which do not require
3014 // ABI changes.
3015 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3016 IsSibcall = true;
3017
3018 if (isTailCall)
3019 ++NumTailCalls;
3020 }
3021
3022 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3023 "Var args not supported with calling convention fastcc, ghc or hipe");
3024
3025 // Analyze operands of the call, assigning locations to each operand.
3026 SmallVector<CCValAssign, 16> ArgLocs;
3027 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3028
3029 // Allocate shadow area for Win64
3030 if (IsWin64)
3031 CCInfo.AllocateStack(32, 8);
3032
3033 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3034
3035 // Get a count of how many bytes are to be pushed on the stack.
3036 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3037 if (IsSibcall)
3038 // This is a sibcall. The memory operands are already available in the
3039 // caller's incoming argument space, which was set up by its own caller.
3040 NumBytes = 0;
3041 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3042 canGuaranteeTCO(CallConv))
3043 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3044
3045 int FPDiff = 0;
3046 if (isTailCall && !IsSibcall && !IsMustTail) {
3047 // Lower arguments at fp - stackoffset + fpdiff.
3048 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3049
3050 FPDiff = NumBytesCallerPushed - NumBytes;
3051
3052 // Set the delta of movement of the return address stack slot.
3053 // But only set if delta is greater than previous delta.
3054 if (FPDiff < X86Info->getTCReturnAddrDelta())
3055 X86Info->setTCReturnAddrDelta(FPDiff);
3056 }
3057
3058 unsigned NumBytesToPush = NumBytes;
3059 unsigned NumBytesToPop = NumBytes;
3060
3061 // If we have an inalloca argument, all stack space has already been allocated
3062 // for us and is right at the top of the stack. We don't support multiple
3063 // arguments passed in memory when using inalloca.
3064 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3065 NumBytesToPush = 0;
3066 if (!ArgLocs.back().isMemLoc())
3067 report_fatal_error("cannot use inalloca attribute on a register "
3068 "parameter");
3069 if (ArgLocs.back().getLocMemOffset() != 0)
3070 report_fatal_error("any parameter with the inalloca attribute must be "
3071 "the only memory argument");
3072 }
3073
3074 if (!IsSibcall)
3075 Chain = DAG.getCALLSEQ_START(
3076 Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
3077
3078 SDValue RetAddrFrIdx;
3079 // Load return address for tail calls.
3080 if (isTailCall && FPDiff)
3081 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3082 Is64Bit, FPDiff, dl);
3083
3084 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3085 SmallVector<SDValue, 8> MemOpChains;
3086 SDValue StackPtr;
3087
3088 // Walk the register/memloc assignments, inserting copies/loads. In the case
3089 // of tail call optimization, arguments are handled later.
3090 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3091 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3092 // Skip inalloca arguments, they have already been written.
3093 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3094 if (Flags.isInAlloca())
3095 continue;
3096
3097 CCValAssign &VA = ArgLocs[i];
3098 EVT RegVT = VA.getLocVT();
3099 SDValue Arg = OutVals[i];
3100 bool isByVal = Flags.isByVal();
3101
3102 // Promote the value if needed.
3103 switch (VA.getLocInfo()) {
3104 default: llvm_unreachable("Unknown loc info!");
3105 case CCValAssign::Full: break;
3106 case CCValAssign::SExt:
3107 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3108 break;
3109 case CCValAssign::ZExt:
3110 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3111 break;
3112 case CCValAssign::AExt:
3113 if (Arg.getValueType().isVector() &&
3114 Arg.getValueType().getVectorElementType() == MVT::i1)
3115 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3116 else if (RegVT.is128BitVector()) {
3117 // Special case: passing MMX values in XMM registers.
3118 Arg = DAG.getBitcast(MVT::i64, Arg);
3119 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3120 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3121 } else
3122 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3123 break;
3124 case CCValAssign::BCvt:
3125 Arg = DAG.getBitcast(RegVT, Arg);
3126 break;
3127 case CCValAssign::Indirect: {
3128 // Store the argument.
3129 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3130 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3131 Chain = DAG.getStore(
3132 Chain, dl, Arg, SpillSlot,
3133 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3134 false, false, 0);
3135 Arg = SpillSlot;
3136 break;
3137 }
3138 }
3139
3140 if (VA.isRegLoc()) {
3141 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3142 if (isVarArg && IsWin64) {
3143 // The Win64 ABI requires an argument passed in an XMM register to also be
3144 // copied to the corresponding shadow GPR if the callee is a varargs function.
3145 unsigned ShadowReg = 0;
3146 switch (VA.getLocReg()) {
3147 case X86::XMM0: ShadowReg = X86::RCX; break;
3148 case X86::XMM1: ShadowReg = X86::RDX; break;
3149 case X86::XMM2: ShadowReg = X86::R8; break;
3150 case X86::XMM3: ShadowReg = X86::R9; break;
3151 }
3152 if (ShadowReg)
3153 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3154 }
3155 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3156 assert(VA.isMemLoc());
3157 if (!StackPtr.getNode())
3158 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3159 getPointerTy(DAG.getDataLayout()));
3160 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3161 dl, DAG, VA, Flags));
3162 }
3163 }
3164
3165 if (!MemOpChains.empty())
3166 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3167
3168 if (Subtarget.isPICStyleGOT()) {
3169 // ELF / PIC requires the GOT pointer to be in the EBX register before
3170 // making function calls via the PLT.
3171 if (!isTailCall) {
3172 RegsToPass.push_back(std::make_pair(
3173 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3174 getPointerTy(DAG.getDataLayout()))));
3175 } else {
3176 // If we are tail calling and generating PIC/GOT style code load the
3177 // address of the callee into ECX. The value in ecx is used as target of
3178 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3179 // for tail calls on PIC/GOT architectures. Normally we would just put the
3180 // address of GOT into ebx and then call target@PLT. But for tail calls
3181 // ebx would be restored (since ebx is callee saved) before jumping to the
3182 // target@PLT.
3183
3184 // Note: The actual moving to ECX is done further down.
3185 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3186 if (G && !G->getGlobal()->hasLocalLinkage() &&
3187 G->getGlobal()->hasDefaultVisibility())
3188 Callee = LowerGlobalAddress(Callee, DAG);
3189 else if (isa<ExternalSymbolSDNode>(Callee))
3190 Callee = LowerExternalSymbol(Callee, DAG);
3191 }
3192 }
3193
3194 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3195 // From AMD64 ABI document:
3196 // For calls that may call functions that use varargs or stdargs
3197 // (prototype-less calls or calls to functions containing ellipsis (...) in
3198 // the declaration) %al is used as hidden argument to specify the number
3199 // of SSE registers used. The contents of %al do not need to match exactly
3200 // the number of registers, but must be an upper bound on the number of SSE
3201 // registers used and is in the range 0 - 8 inclusive.
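// For example, a call to printf passing a single double in %xmm0 may set %al
// to 1 (any value from 1 to 8 would also be a valid upper bound).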
3202
3203 // Count the number of XMM registers allocated.
3204 static const MCPhysReg XMMArgRegs[] = {
3205 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3206 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3207 };
3208 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3209 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3210 && "SSE registers cannot be used when SSE is disabled");
3211
3212 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3213 DAG.getConstant(NumXMMRegs, dl,
3214 MVT::i8)));
3215 }
3216
3217 if (isVarArg && IsMustTail) {
3218 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3219 for (const auto &F : Forwards) {
3220 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3221 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3222 }
3223 }
3224
3225 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3226 // don't need this because the eligibility check rejects calls that require
3227 // shuffling arguments passed in memory.
3228 if (!IsSibcall && isTailCall) {
3229 // Force all the incoming stack arguments to be loaded from the stack
3230 // before any new outgoing arguments are stored to the stack, because the
3231 // outgoing stack slots may alias the incoming argument stack slots, and
3232 // the alias isn't otherwise explicit. This is slightly more conservative
3233 // than necessary, because it means that each store effectively depends
3234 // on every argument instead of just those arguments it would clobber.
3235 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3236
3237 SmallVector<SDValue, 8> MemOpChains2;
3238 SDValue FIN;
3239 int FI = 0;
3240 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3241 CCValAssign &VA = ArgLocs[i];
3242 if (VA.isRegLoc())
3243 continue;
3244 assert(VA.isMemLoc());
3245 SDValue Arg = OutVals[i];
3246 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3247 // Skip inalloca arguments. They don't require any work.
3248 if (Flags.isInAlloca())
3249 continue;
3250 // Create frame index.
3251 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3252 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3253 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3254 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3255
3256 if (Flags.isByVal()) {
3257 // Copy relative to framepointer.
3258 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3259 if (!StackPtr.getNode())
3260 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3261 getPointerTy(DAG.getDataLayout()));
3262 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3263 StackPtr, Source);
3264
3265 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3266 ArgChain,
3267 Flags, DAG, dl));
3268 } else {
3269 // Store relative to framepointer.
3270 MemOpChains2.push_back(DAG.getStore(
3271 ArgChain, dl, Arg, FIN,
3272 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3273 false, false, 0));
3274 }
3275 }
3276
3277 if (!MemOpChains2.empty())
3278 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3279
3280 // Store the return address to the appropriate stack slot.
3281 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3282 getPointerTy(DAG.getDataLayout()),
3283 RegInfo->getSlotSize(), FPDiff, dl);
3284 }
3285
3286 // Build a sequence of copy-to-reg nodes chained together with token chain
3287 // and flag operands which copy the outgoing args into registers.
3288 SDValue InFlag;
3289 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3290 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3291 RegsToPass[i].second, InFlag);
3292 InFlag = Chain.getValue(1);
3293 }
3294
3295 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3296 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3297 // In the 64-bit large code model, we have to make all calls
3298 // through a register, since the call instruction's 32-bit
3299 // pc-relative offset may not be large enough to hold the whole
3300 // address.
3301 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3302 // If the callee is a GlobalAddress node (quite common, every direct call
3303 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3304 // it.
3305 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3306
3307 // We should use an extra load for direct calls to dllimported functions in
3308 // non-JIT mode.
3309 const GlobalValue *GV = G->getGlobal();
3310 if (!GV->hasDLLImportStorageClass()) {
3311 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3312
3313 Callee = DAG.getTargetGlobalAddress(
3314 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3315
3316 if (OpFlags == X86II::MO_GOTPCREL) {
3317 // Add a wrapper.
3318 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3319 getPointerTy(DAG.getDataLayout()), Callee);
3320 // Add extra indirection
3321 Callee = DAG.getLoad(
3322 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3323 MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false,
3324 false, 0);
3325 }
3326 }
3327 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3328 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3329 unsigned char OpFlags =
3330 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3331
3332 Callee = DAG.getTargetExternalSymbol(
3333 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3334 } else if (Subtarget.isTarget64BitILP32() &&
3335 Callee->getValueType(0) == MVT::i32) {
3336 // Zero-extend the 32-bit Callee address into a 64-bit value, per the x32 ABI.
3337 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3338 }
3339
3340 // Returns a chain & a flag for retval copy to use.
3341 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3342 SmallVector<SDValue, 8> Ops;
3343
3344 if (!IsSibcall && isTailCall) {
3345 Chain = DAG.getCALLSEQ_END(Chain,
3346 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3347 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3348 InFlag = Chain.getValue(1);
3349 }
3350
3351 Ops.push_back(Chain);
3352 Ops.push_back(Callee);
3353
3354 if (isTailCall)
3355 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3356
3357 // Add argument registers to the end of the list so that they are known live
3358 // into the call.
3359 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3360 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3361 RegsToPass[i].second.getValueType()));
3362
3363 // Add a register mask operand representing the call-preserved registers.
3364 const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
3365 assert(Mask && "Missing call preserved mask for calling convention");
3366
3367 // If this is an invoke in a 32-bit function using a funclet-based
3368 // personality, assume the function clobbers all registers. If an exception
3369 // is thrown, the runtime will not restore CSRs.
3370 // FIXME: Model this more precisely so that we can register allocate across
3371 // the normal edge and spill and fill across the exceptional edge.
3372 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3373 const Function *CallerFn = MF.getFunction();
3374 EHPersonality Pers =
3375 CallerFn->hasPersonalityFn()
3376 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3377 : EHPersonality::Unknown;
3378 if (isFuncletEHPersonality(Pers))
3379 Mask = RegInfo->getNoPreservedMask();
3380 }
3381
3382 Ops.push_back(DAG.getRegisterMask(Mask));
3383
3384 if (InFlag.getNode())
3385 Ops.push_back(InFlag);
3386
3387 if (isTailCall) {
3388 // We used to do:
3389 //// If this is the first return lowered for this function, add the regs
3390 //// to the liveout set for the function.
3391 // This isn't right, although it's probably harmless on x86; liveouts
3392 // should be computed from returns not tail calls. Consider a void
3393 // function making a tail call to a function returning int.
3394 MF.getFrameInfo()->setHasTailCall();
3395 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3396 }
3397
3398 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3399 InFlag = Chain.getValue(1);
3400
3401 // Create the CALLSEQ_END node.
3402 unsigned NumBytesForCalleeToPop;
3403 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3404 DAG.getTarget().Options.GuaranteedTailCallOpt))
3405 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3406 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3407 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3408 SR == StackStructReturn)
3409 // If this is a call to a struct-return function, the callee
3410 // pops the hidden struct pointer, so we have to push it back.
3411 // This is common for Darwin/X86, Linux & Mingw32 targets.
3412 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3413 NumBytesForCalleeToPop = 4;
3414 else
3415 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3416
3417 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3418 // No need to reset the stack after the call if the call doesn't return. To
3419 // make the MI verifier happy, we'll pretend the callee does it for us.
3420 NumBytesForCalleeToPop = NumBytes;
3421 }
3422
3423 // Returns a flag for retval copy to use.
3424 if (!IsSibcall) {
3425 Chain = DAG.getCALLSEQ_END(Chain,
3426 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3427 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3428 true),
3429 InFlag, dl);
3430 InFlag = Chain.getValue(1);
3431 }
3432
3433 // Handle result values, copying them out of physregs into vregs that we
3434 // return.
3435 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3436 Ins, dl, DAG, InVals);
3437 }
3438
3439 //===----------------------------------------------------------------------===//
3440 // Fast Calling Convention (tail call) implementation
3441 //===----------------------------------------------------------------------===//
3442
3443 // Like stdcall, this is a callee-cleans-arguments convention, except that ECX
3444 // is reserved for storing the tail-called function's address. Only 2 registers
3445 // are free for argument passing (inreg). Tail call optimization is performed
3446 // provided:
3447 // * tailcallopt is enabled
3448 // * caller/callee are fastcc
3449 // On X86_64 architecture with GOT-style position independent code only local
3450 // (within module) calls are supported at the moment.
3451 // To keep the stack aligned according to the platform ABI, the function
3452 // GetAlignedArgumentStackSize ensures that the argument delta is always a
3453 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example.)
3454 // If a tail-called function (the callee) has more arguments than the caller, the
3455 // caller needs to make sure that there is room to move the RETADDR to. This is
3456 // achieved by reserving an area the size of the argument delta right after the
3457 // original RETADDR, but before the saved framepointer or the spilled registers
3458 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3459 // stack layout:
3460 // arg1
3461 // arg2
3462 // RETADDR
3463 // [ new RETADDR
3464 // move area ]
3465 // (possible EBP)
3466 // ESI
3467 // EDI
3468 // local1 ..
3469
3470 /// Align the stack size to e.g. 16n + 12 bytes to satisfy a 16-byte alignment
3471 /// requirement.
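/// For example, with a 16-byte stack alignment and a 4-byte slot size, a
/// StackSize of 20 becomes 28 (16n + 12), so the stack is 16-byte aligned again
/// once the 4-byte return address slot is accounted for.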
3472 unsigned
3473 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3474 SelectionDAG& DAG) const {
3475 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3476 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3477 unsigned StackAlignment = TFI.getStackAlignment();
3478 uint64_t AlignMask = StackAlignment - 1;
3479 int64_t Offset = StackSize;
3480 unsigned SlotSize = RegInfo->getSlotSize();
3481 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3482 // Number smaller than 12 so just add the difference.
3483 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3484 } else {
3485 // Mask out lower bits, add stackalignment once plus the 12 bytes.
3486 Offset = ((~AlignMask) & Offset) + StackAlignment +
3487 (StackAlignment-SlotSize);
3488 }
3489 return Offset;
3490 }
3491
3492 /// Return true if the given stack call argument is already available in the
3493 /// same position (relatively) of the caller's incoming argument stack.
3494 static
3495 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3496 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3497 const X86InstrInfo *TII, const CCValAssign &VA) {
3498 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3499
3500 for (;;) {
3501 // Look through nodes that don't alter the bits of the incoming value.
3502 unsigned Op = Arg.getOpcode();
3503 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3504 Arg = Arg.getOperand(0);
3505 continue;
3506 }
3507 if (Op == ISD::TRUNCATE) {
3508 const SDValue &TruncInput = Arg.getOperand(0);
3509 if (TruncInput.getOpcode() == ISD::AssertZext &&
3510 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3511 Arg.getValueType()) {
3512 Arg = TruncInput.getOperand(0);
3513 continue;
3514 }
3515 }
3516 break;
3517 }
3518
3519 int FI = INT_MAX;
3520 if (Arg.getOpcode() == ISD::CopyFromReg) {
3521 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3522 if (!TargetRegisterInfo::isVirtualRegister(VR))
3523 return false;
3524 MachineInstr *Def = MRI->getVRegDef(VR);
3525 if (!Def)
3526 return false;
3527 if (!Flags.isByVal()) {
3528 if (!TII->isLoadFromStackSlot(*Def, FI))
3529 return false;
3530 } else {
3531 unsigned Opcode = Def->getOpcode();
3532 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3533 Opcode == X86::LEA64_32r) &&
3534 Def->getOperand(1).isFI()) {
3535 FI = Def->getOperand(1).getIndex();
3536 Bytes = Flags.getByValSize();
3537 } else
3538 return false;
3539 }
3540 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3541 if (Flags.isByVal())
3542 // ByVal argument is passed in as a pointer but it's now being
3543 // dereferenced. e.g.
3544 // define @foo(%struct.X* %A) {
3545 // tail call @bar(%struct.X* byval %A)
3546 // }
3547 return false;
3548 SDValue Ptr = Ld->getBasePtr();
3549 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3550 if (!FINode)
3551 return false;
3552 FI = FINode->getIndex();
3553 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3554 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3555 FI = FINode->getIndex();
3556 Bytes = Flags.getByValSize();
3557 } else
3558 return false;
3559
3560 assert(FI != INT_MAX);
3561 if (!MFI->isFixedObjectIndex(FI))
3562 return false;
3563
3564 if (Offset != MFI->getObjectOffset(FI))
3565 return false;
3566
3567 if (VA.getLocVT().getSizeInBits() > Arg.getValueType().getSizeInBits()) {
3568 // If the argument location is wider than the argument type, check that any
3569 // extension flags match.
3570 if (Flags.isZExt() != MFI->isObjectZExt(FI) ||
3571 Flags.isSExt() != MFI->isObjectSExt(FI)) {
3572 return false;
3573 }
3574 }
3575
3576 return Bytes == MFI->getObjectSize(FI);
3577 }
3578
3579 /// Check whether the call is eligible for tail call optimization. Targets
3580 /// that want to do tail call optimization should implement this function.
3581 bool X86TargetLowering::IsEligibleForTailCallOptimization(
3582 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3583 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
3584 const SmallVectorImpl<ISD::OutputArg> &Outs,
3585 const SmallVectorImpl<SDValue> &OutVals,
3586 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3587 if (!mayTailCallThisCC(CalleeCC))
3588 return false;
3589
3590 // If -tailcallopt is specified, make fastcc functions tail-callable.
3591 MachineFunction &MF = DAG.getMachineFunction();
3592 const Function *CallerF = MF.getFunction();
3593
3594 // If the function return type is x86_fp80 and the callee return type is not,
3595 // then the FP_EXTEND of the call result is not a nop. It's not safe to
3596 // perform a tailcall optimization here.
3597 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3598 return false;
3599
3600 CallingConv::ID CallerCC = CallerF->getCallingConv();
3601 bool CCMatch = CallerCC == CalleeCC;
3602 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
3603 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
3604
3605 // Win64 functions have extra shadow space for argument homing. Don't do the
3606 // sibcall if the caller and callee have mismatched expectations for this
3607 // space.
3608 if (IsCalleeWin64 != IsCallerWin64)
3609 return false;
3610
3611 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3612 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3613 return true;
3614 return false;
3615 }
3616
3617 // Look for obvious safe cases to perform tail call optimization that do not
3618 // require ABI changes. This is what gcc calls sibcall.
3619
3620 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3621 // emit a special epilogue.
3622 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3623 if (RegInfo->needsStackRealignment(MF))
3624 return false;
3625
3626 // Also avoid sibcall optimization if either caller or callee uses struct
3627 // return semantics.
3628 if (isCalleeStructRet || isCallerStructRet)
3629 return false;
3630
3631 // Do not sibcall optimize vararg calls unless all arguments are passed via
3632 // registers.
3633 LLVMContext &C = *DAG.getContext();
3634 if (isVarArg && !Outs.empty()) {
3635 // Optimizing for varargs on Win64 is unlikely to be safe without
3636 // additional testing.
3637 if (IsCalleeWin64 || IsCallerWin64)
3638 return false;
3639
3640 SmallVector<CCValAssign, 16> ArgLocs;
3641 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3642
3643 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3644 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3645 if (!ArgLocs[i].isRegLoc())
3646 return false;
3647 }
3648
3649 // If the call result is in ST0 / ST1, it needs to be popped off the x87
3650 // stack. Therefore, if it's not used by the call it is not safe to optimize
3651 // this into a sibcall.
3652 bool Unused = false;
3653 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3654 if (!Ins[i].Used) {
3655 Unused = true;
3656 break;
3657 }
3658 }
3659 if (Unused) {
3660 SmallVector<CCValAssign, 16> RVLocs;
3661 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
3662 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3663 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3664 CCValAssign &VA = RVLocs[i];
3665 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3666 return false;
3667 }
3668 }
3669
3670 // Check that the call results are passed in the same way.
3671 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3672 RetCC_X86, RetCC_X86))
3673 return false;
3674 // The callee has to preserve all registers the caller needs to preserve.
3675 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3676 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3677 if (!CCMatch) {
3678 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3679 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3680 return false;
3681 }
3682
3683 unsigned StackArgsSize = 0;
3684
3685 // If the callee takes no arguments then go on to check the results of the
3686 // call.
3687 if (!Outs.empty()) {
3688 // Check if stack adjustment is needed. For now, do not do this if any
3689 // argument is passed on the stack.
3690 SmallVector<CCValAssign, 16> ArgLocs;
3691 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3692
3693 // Allocate shadow area for Win64
3694 if (IsCalleeWin64)
3695 CCInfo.AllocateStack(32, 8);
3696
3697 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3698 StackArgsSize = CCInfo.getNextStackOffset();
3699
3700 if (CCInfo.getNextStackOffset()) {
3701 // Check if the arguments are already laid out in the right way as
3702 // the caller's fixed stack objects.
3703 MachineFrameInfo *MFI = MF.getFrameInfo();
3704 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3705 const X86InstrInfo *TII = Subtarget.getInstrInfo();
3706 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3707 CCValAssign &VA = ArgLocs[i];
3708 SDValue Arg = OutVals[i];
3709 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3710 if (VA.getLocInfo() == CCValAssign::Indirect)
3711 return false;
3712 if (!VA.isRegLoc()) {
3713 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3714 MFI, MRI, TII, VA))
3715 return false;
3716 }
3717 }
3718 }
3719
3720 bool PositionIndependent = isPositionIndependent();
3721 // If the tailcall address may be in a register, then make sure it's
3722 // possible to register allocate for it. In 32-bit, the call address can
3723 // only target EAX, EDX, or ECX since the tail call must be scheduled after
3724 // callee-saved registers are restored. These happen to be the same
3725 // registers used to pass 'inreg' arguments so watch out for those.
3726 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
3727 !isa<ExternalSymbolSDNode>(Callee)) ||
3728 PositionIndependent)) {
3729 unsigned NumInRegs = 0;
3730 // In PIC we need an extra register to formulate the address computation
3731 // for the callee.
3732 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
3733
3734 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3735 CCValAssign &VA = ArgLocs[i];
3736 if (!VA.isRegLoc())
3737 continue;
3738 unsigned Reg = VA.getLocReg();
3739 switch (Reg) {
3740 default: break;
3741 case X86::EAX: case X86::EDX: case X86::ECX:
3742 if (++NumInRegs == MaxInRegs)
3743 return false;
3744 break;
3745 }
3746 }
3747 }
3748
3749 const MachineRegisterInfo &MRI = MF.getRegInfo();
3750 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3751 return false;
3752 }
3753
3754 bool CalleeWillPop =
3755 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
3756 MF.getTarget().Options.GuaranteedTailCallOpt);
3757
3758 if (unsigned BytesToPop =
3759 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
3760 // If we have bytes to pop, the callee must pop them.
3761 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
3762 if (!CalleePopMatches)
3763 return false;
3764 } else if (CalleeWillPop && StackArgsSize > 0) {
3765 // If we don't have bytes to pop, make sure the callee doesn't pop any.
3766 return false;
3767 }
3768
3769 return true;
3770 }
3771
3772 FastISel *
3773 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3774 const TargetLibraryInfo *libInfo) const {
3775 return X86::createFastISel(funcInfo, libInfo);
3776 }
3777
3778 //===----------------------------------------------------------------------===//
3779 // Other Lowering Hooks
3780 //===----------------------------------------------------------------------===//
3781
3782 static bool MayFoldLoad(SDValue Op) {
3783 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3784 }
3785
3786 static bool MayFoldIntoStore(SDValue Op) {
3787 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3788 }
3789
3790 static bool isTargetShuffle(unsigned Opcode) {
3791 switch(Opcode) {
3792 default: return false;
3793 case X86ISD::BLENDI:
3794 case X86ISD::PSHUFB:
3795 case X86ISD::PSHUFD:
3796 case X86ISD::PSHUFHW:
3797 case X86ISD::PSHUFLW:
3798 case X86ISD::SHUFP:
3799 case X86ISD::INSERTPS:
3800 case X86ISD::PALIGNR:
3801 case X86ISD::VSHLDQ:
3802 case X86ISD::VSRLDQ:
3803 case X86ISD::MOVLHPS:
3804 case X86ISD::MOVLHPD:
3805 case X86ISD::MOVHLPS:
3806 case X86ISD::MOVLPS:
3807 case X86ISD::MOVLPD:
3808 case X86ISD::MOVSHDUP:
3809 case X86ISD::MOVSLDUP:
3810 case X86ISD::MOVDDUP:
3811 case X86ISD::MOVSS:
3812 case X86ISD::MOVSD:
3813 case X86ISD::UNPCKL:
3814 case X86ISD::UNPCKH:
3815 case X86ISD::VPERMILPI:
3816 case X86ISD::VPERMILPV:
3817 case X86ISD::VPERM2X128:
3818 case X86ISD::VPERMIL2:
3819 case X86ISD::VPERMI:
3820 case X86ISD::VPPERM:
3821 case X86ISD::VPERMV:
3822 case X86ISD::VPERMV3:
3823 case X86ISD::VZEXT_MOVL:
3824 return true;
3825 }
3826 }
3827
3828 static bool isTargetShuffleVariableMask(unsigned Opcode) {
3829 switch (Opcode) {
3830 default: return false;
3831 case X86ISD::PSHUFB:
3832 case X86ISD::VPERMILPV:
3833 return true;
3834 }
3835 }
3836
3837 static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
3838 SDValue V1, unsigned TargetMask,
3839 SelectionDAG &DAG) {
3840 switch(Opc) {
3841 default: llvm_unreachable("Unknown x86 shuffle node");
3842 case X86ISD::PSHUFD:
3843 case X86ISD::PSHUFHW:
3844 case X86ISD::PSHUFLW:
3845 case X86ISD::VPERMILPI:
3846 case X86ISD::VPERMI:
3847 return DAG.getNode(Opc, dl, VT, V1,
3848 DAG.getConstant(TargetMask, dl, MVT::i8));
3849 }
3850 }
3851
3852 static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
3853 SDValue V1, SDValue V2, SelectionDAG &DAG) {
3854 switch(Opc) {
3855 default: llvm_unreachable("Unknown x86 shuffle node");
3856 case X86ISD::MOVLHPS:
3857 case X86ISD::MOVLHPD:
3858 case X86ISD::MOVHLPS:
3859 case X86ISD::MOVLPS:
3860 case X86ISD::MOVLPD:
3861 case X86ISD::MOVSS:
3862 case X86ISD::MOVSD:
3863 case X86ISD::UNPCKL:
3864 case X86ISD::UNPCKH:
3865 return DAG.getNode(Opc, dl, VT, V1, V2);
3866 }
3867 }
3868
3869 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3870 MachineFunction &MF = DAG.getMachineFunction();
3871 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3872 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3873 int ReturnAddrIndex = FuncInfo->getRAIndex();
3874
3875 if (ReturnAddrIndex == 0) {
3876 // Set up a frame object for the return address.
3877 unsigned SlotSize = RegInfo->getSlotSize();
3878 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3879 -(int64_t)SlotSize,
3880 false);
3881 FuncInfo->setRAIndex(ReturnAddrIndex);
3882 }
3883
3884 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
3885 }
3886
3887 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3888 bool hasSymbolicDisplacement) {
3889 // The offset should fit into a sign-extended 32-bit immediate field.
3890 if (!isInt<32>(Offset))
3891 return false;
3892
3893 // If we don't have a symbolic displacement, we don't have any extra
3894 // restrictions.
3895 if (!hasSymbolicDisplacement)
3896 return true;
3897
3898 // FIXME: Some tweaks might be needed for medium code model.
3899 if (M != CodeModel::Small && M != CodeModel::Kernel)
3900 return false;
3901
3902 // For the small code model we assume that the last object ends 16MB before
3903 // the end of the 31-bit boundary. We may also accept pretty large negative
3904 // offsets, knowing that all objects are in the positive half of the address space.
3905 if (M == CodeModel::Small && Offset < 16*1024*1024)
3906 return true;
3907
3908 // For the kernel code model we know that all objects reside in the negative
3909 // half of the 32-bit address space. We must not accept negative offsets, since
3910 // they may push an address out of range, but we may accept pretty large positive ones.
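// (The Linux kernel, for instance, is linked at 0xffffffff80000000, in the top
// 2GB of the canonical address space.)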
3911 if (M == CodeModel::Kernel && Offset >= 0)
3912 return true;
3913
3914 return false;
3915 }
3916
3917 /// Determines whether the callee is required to pop its own arguments.
3918 /// Callee pop is necessary to support tail calls.
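/// For example, a 32-bit stdcall function taking two i32 arguments returns with
/// "ret 8", popping its own argument area.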
3919 bool X86::isCalleePop(CallingConv::ID CallingConv,
3920 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
3921 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
3922 // can guarantee TCO.
3923 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
3924 return true;
3925
3926 switch (CallingConv) {
3927 default:
3928 return false;
3929 case CallingConv::X86_StdCall:
3930 case CallingConv::X86_FastCall:
3931 case CallingConv::X86_ThisCall:
3932 case CallingConv::X86_VectorCall:
3933 return !is64Bit;
3934 }
3935 }
3936
3937 /// \brief Return true if the condition is an unsigned comparison operation.
3938 static bool isX86CCUnsigned(unsigned X86CC) {
3939 switch (X86CC) {
3940 default:
3941 llvm_unreachable("Invalid integer condition!");
3942 case X86::COND_E:
3943 case X86::COND_NE:
3944 case X86::COND_B:
3945 case X86::COND_A:
3946 case X86::COND_BE:
3947 case X86::COND_AE:
3948 return true;
3949 case X86::COND_G:
3950 case X86::COND_GE:
3951 case X86::COND_L:
3952 case X86::COND_LE:
3953 return false;
3954 }
3955 }
3956
3957 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
3958 switch (SetCCOpcode) {
3959 default: llvm_unreachable("Invalid integer condition!");
3960 case ISD::SETEQ: return X86::COND_E;
3961 case ISD::SETGT: return X86::COND_G;
3962 case ISD::SETGE: return X86::COND_GE;
3963 case ISD::SETLT: return X86::COND_L;
3964 case ISD::SETLE: return X86::COND_LE;
3965 case ISD::SETNE: return X86::COND_NE;
3966 case ISD::SETULT: return X86::COND_B;
3967 case ISD::SETUGT: return X86::COND_A;
3968 case ISD::SETULE: return X86::COND_BE;
3969 case ISD::SETUGE: return X86::COND_AE;
3970 }
3971 }
3972
3973 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
3974 /// condition code, returning the condition code and the LHS/RHS of the
3975 /// comparison to make.
3976 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
3977 bool isFP, SDValue &LHS, SDValue &RHS,
3978 SelectionDAG &DAG) {
3979 if (!isFP) {
3980 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3981 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3982 // X > -1 -> X == 0, jump !sign.
3983 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3984 return X86::COND_NS;
3985 }
3986 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3987 // X < 0 -> X == 0, jump on sign.
3988 return X86::COND_S;
3989 }
3990 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3991 // X < 1 -> X <= 0
3992 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3993 return X86::COND_LE;
3994 }
3995 }
3996
3997 return TranslateIntegerX86CC(SetCCOpcode);
3998 }
3999
4000 // First determine if it is required or is profitable to flip the operands.
4001
4002 // If LHS is a foldable load, but RHS is not, flip the condition.
4003 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4004 !ISD::isNON_EXTLoad(RHS.getNode())) {
4005 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4006 std::swap(LHS, RHS);
4007 }
4008
4009 switch (SetCCOpcode) {
4010 default: break;
4011 case ISD::SETOLT:
4012 case ISD::SETOLE:
4013 case ISD::SETUGT:
4014 case ISD::SETUGE:
4015 std::swap(LHS, RHS);
4016 break;
4017 }
4018
4019 // On a floating point condition, the flags are set as follows:
4020 // ZF PF CF op
4021 // 0 | 0 | 0 | X > Y
4022 // 0 | 0 | 1 | X < Y
4023 // 1 | 0 | 0 | X == Y
4024 // 1 | 1 | 1 | unordered
4025 switch (SetCCOpcode) {
4026 default: llvm_unreachable("Condcode should be pre-legalized away");
4027 case ISD::SETUEQ:
4028 case ISD::SETEQ: return X86::COND_E;
4029 case ISD::SETOLT: // flipped
4030 case ISD::SETOGT:
4031 case ISD::SETGT: return X86::COND_A;
4032 case ISD::SETOLE: // flipped
4033 case ISD::SETOGE:
4034 case ISD::SETGE: return X86::COND_AE;
4035 case ISD::SETUGT: // flipped
4036 case ISD::SETULT:
4037 case ISD::SETLT: return X86::COND_B;
4038 case ISD::SETUGE: // flipped
4039 case ISD::SETULE:
4040 case ISD::SETLE: return X86::COND_BE;
4041 case ISD::SETONE:
4042 case ISD::SETNE: return X86::COND_NE;
4043 case ISD::SETUO: return X86::COND_P;
4044 case ISD::SETO: return X86::COND_NP;
4045 case ISD::SETOEQ:
4046 case ISD::SETUNE: return X86::COND_INVALID;
4047 }
4048 }
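// Illustrative examples of the integer special cases above (hypothetical
// operands):
//   (X setgt -1): RHS is rewritten to 0 and COND_NS is returned, i.e.
//                 "X > -1" becomes "test the sign of X, jump if not sign".
//   (X setlt 1) : RHS is rewritten to 0 and COND_LE is returned, i.e.
//                 "X < 1" becomes "X <= 0".
// All other integer cases fall through to TranslateIntegerX86CC.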
4049
4050 /// Is there a floating point cmov for the specific X86 condition code?
4051 /// The current x86 ISA includes the following FP cmov instructions:
4052 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4053 static bool hasFPCMov(unsigned X86CC) {
4054 switch (X86CC) {
4055 default:
4056 return false;
4057 case X86::COND_B:
4058 case X86::COND_BE:
4059 case X86::COND_E:
4060 case X86::COND_P:
4061 case X86::COND_A:
4062 case X86::COND_AE:
4063 case X86::COND_NE:
4064 case X86::COND_NP:
4065 return true;
4066 }
4067 }
4068
4069
4070 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4071 const CallInst &I,
4072 unsigned Intrinsic) const {
4073
4074 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4075 if (!IntrData)
4076 return false;
4077
4078 Info.opc = ISD::INTRINSIC_W_CHAIN;
4079 Info.readMem = false;
4080 Info.writeMem = false;
4081 Info.vol = false;
4082 Info.offset = 0;
4083
4084 switch (IntrData->Type) {
4085 case EXPAND_FROM_MEM: {
4086 Info.ptrVal = I.getArgOperand(0);
4087 Info.memVT = MVT::getVT(I.getType());
4088 Info.align = 1;
4089 Info.readMem = true;
4090 break;
4091 }
4092 case COMPRESS_TO_MEM: {
4093 Info.ptrVal = I.getArgOperand(0);
4094 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4095 Info.align = 1;
4096 Info.writeMem = true;
4097 break;
4098 }
4099 case TRUNCATE_TO_MEM_VI8:
4100 case TRUNCATE_TO_MEM_VI16:
4101 case TRUNCATE_TO_MEM_VI32: {
4102 Info.ptrVal = I.getArgOperand(0);
4103 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4104 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4105 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4106 ScalarVT = MVT::i8;
4107 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4108 ScalarVT = MVT::i16;
4109 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4110 ScalarVT = MVT::i32;
4111
4112 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4113 Info.align = 1;
4114 Info.writeMem = true;
4115 break;
4116 }
4117 default:
4118 return false;
4119 }
4120
4121 return true;
4122 }
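// Illustrative sketch of the TRUNCATE_TO_MEM_* case above (hypothetical
// intrinsic): if the data operand (argument 1) is a v8i64 value and
// IntrData->Type is TRUNCATE_TO_MEM_VI8, then ScalarVT is MVT::i8 and
// Info.memVT becomes v8i8, i.e. the memory access is the truncated 8 x i8
// store, not the full 8 x i64 register value.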
4123
4124 /// Returns true if the target can instruction select the
4125 /// specified FP immediate natively. If false, the legalizer will
4126 /// materialize the FP immediate as a load from a constant pool.
4127 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4128 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4129 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4130 return true;
4131 }
4132 return false;
4133 }
4134
4135 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4136 ISD::LoadExtType ExtTy,
4137 EVT NewVT) const {
4138 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4139 // relocations must target a movq or addq instruction: don't let the load shrink.
4140 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4141 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4142 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4143 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4144 return true;
4145 }
4146
4147 /// \brief Returns true if it is beneficial to convert a load of a constant
4148 /// to just the constant itself.
4149 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4150 Type *Ty) const {
4151 assert(Ty->isIntegerTy());
4152
4153 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4154 if (BitSize == 0 || BitSize > 64)
4155 return false;
4156 return true;
4157 }
4158
4159 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4160 unsigned Index) const {
4161 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4162 return false;
4163
4164 return (Index == 0 || Index == ResVT.getVectorNumElements());
4165 }
4166
4167 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4168 // Speculate cttz only if we can directly use TZCNT.
4169 return Subtarget.hasBMI();
4170 }
4171
4172 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4173 // Speculate ctlz only if we can directly use LZCNT.
4174 return Subtarget.hasLZCNT();
4175 }
4176
4177 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4178 if (!Subtarget.hasBMI())
4179 return false;
4180
4181 // There are only 32-bit and 64-bit forms for 'andn'.
4182 EVT VT = Y.getValueType();
4183 if (VT != MVT::i32 && VT != MVT::i64)
4184 return false;
4185
4186 return true;
4187 }
4188
4189 /// Return true if every element in Mask, beginning
4190 /// from position Pos and ending in Pos+Size is undef.
4191 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4192 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4193 if (0 <= Mask[i])
4194 return false;
4195 return true;
4196 }
4197
4198 /// Return true if Val is undef or if its value falls within the
4199 /// specified range [Low, Hi).
4200 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4201 return (Val < 0) || (Val >= Low && Val < Hi);
4202 }
4203
4204 /// Return true if every element in Mask is undef or if its value
4205 /// falls within the specified range [Low, Hi).
4206 static bool isUndefOrInRange(ArrayRef<int> Mask,
4207 int Low, int Hi) {
4208 for (int M : Mask)
4209 if (!isUndefOrInRange(M, Low, Hi))
4210 return false;
4211 return true;
4212 }
4213
4214 /// Val is either less than zero (undef) or equal to the specified value.
4215 static bool isUndefOrEqual(int Val, int CmpVal) {
4216 return (Val < 0 || Val == CmpVal);
4217 }
4218
4219 /// Val is either the undef or zero sentinel value.
4220 static bool isUndefOrZero(int Val) {
4221 return (Val == SM_SentinelUndef || Val == SM_SentinelZero);
4222 }
4223
4224 /// Return true if every element in Mask, beginning
4225 /// from position Pos and ending in Pos+Size, falls within the specified
4226 /// sequential range [Low, Low+Size), or is undef.
4227 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4228 unsigned Pos, unsigned Size, int Low) {
4229 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4230 if (!isUndefOrEqual(Mask[i], Low))
4231 return false;
4232 return true;
4233 }
4234
4235 /// Return true if every element in Mask, beginning
4236 /// from position Pos and ending in Pos+Size, falls within the specified
4237 /// sequential range [Low, Low+Size), or is undef or zero.
4238 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4239 unsigned Size, int Low) {
4240 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4241 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4242 return false;
4243 return true;
4244 }
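// Mask examples for the helpers above (hypothetical masks):
//   isSequentialOrUndefInRange({4, 5, -1, 7}, /*Pos=*/0, /*Size=*/4,
//                              /*Low=*/4) -> true
//     (element 2 is undef, the rest count up from 4)
//   isSequentialOrUndefOrZeroInRange({4, SM_SentinelZero, 6, 7}, /*Pos=*/0,
//                                    /*Size=*/4, /*Low=*/4) -> true
//     (a zero sentinel is also accepted)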
4245
4246 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
4247 /// extract that is suitable for instructions that extract 128- or 256-bit vectors.
4248 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4249 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4250 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4251 return false;
4252
4253 // The index should be aligned on a vecWidth-bit boundary.
4254 uint64_t Index =
4255 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4256
4257 MVT VT = N->getSimpleValueType(0);
4258 unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4259 bool Result = (Index * ElSize) % vecWidth == 0;
4260
4261 return Result;
4262 }
4263
4264 /// Return true if the specified INSERT_SUBVECTOR
4265 /// operand specifies a subvector insert that is suitable for input to
4266 /// insertion of 128- or 256-bit subvectors.
4267 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4268 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4269 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4270 return false;
4271 // The index should be aligned on a vecWidth-bit boundary.
4272 uint64_t Index =
4273 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4274
4275 MVT VT = N->getSimpleValueType(0);
4276 unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4277 bool Result = (Index * ElSize) % vecWidth == 0;
4278
4279 return Result;
4280 }
4281
4282 bool X86::isVINSERT128Index(SDNode *N) {
4283 return isVINSERTIndex(N, 128);
4284 }
4285
4286 bool X86::isVINSERT256Index(SDNode *N) {
4287 return isVINSERTIndex(N, 256);
4288 }
4289
4290 bool X86::isVEXTRACT128Index(SDNode *N) {
4291 return isVEXTRACTIndex(N, 128);
4292 }
4293
4294 bool X86::isVEXTRACT256Index(SDNode *N) {
4295 return isVEXTRACTIndex(N, 256);
4296 }
4297
4298 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4299 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4300 assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4301 "Illegal extract subvector for VEXTRACT");
4302
4303 uint64_t Index =
4304 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4305
4306 MVT VecVT = N->getOperand(0).getSimpleValueType();
4307 MVT ElVT = VecVT.getVectorElementType();
4308
4309 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4310 return Index / NumElemsPerChunk;
4311 }
4312
4313 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4314 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4315 assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4316 "Illegal insert subvector for VINSERT");
4317
4318 uint64_t Index =
4319 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4320
4321 MVT VecVT = N->getSimpleValueType(0);
4322 MVT ElVT = VecVT.getVectorElementType();
4323
4324 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4325 return Index / NumElemsPerChunk;
4326 }
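// Worked example for the two helpers above (hypothetical node): extracting or
// inserting a 128-bit chunk of a v8f32 at element index 4 gives
//   NumElemsPerChunk = 128 / 32 = 4,  immediate = 4 / 4 = 1
// i.e. the upper 128-bit lane, which is exactly the imm8 that VEXTRACTF128 /
// VINSERTF128 expect.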
4327
4328 /// Return the appropriate immediate to extract the specified
4329 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
4330 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4331 return getExtractVEXTRACTImmediate(N, 128);
4332 }
4333
4334 /// Return the appropriate immediate to extract the specified
4335 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
4336 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4337 return getExtractVEXTRACTImmediate(N, 256);
4338 }
4339
4340 /// Return the appropriate immediate to insert at the specified
4341 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
4342 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4343 return getInsertVINSERTImmediate(N, 128);
4344 }
4345
4346 /// Return the appropriate immediate to insert at the specified
4347 /// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
4348 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4349 return getInsertVINSERTImmediate(N, 256);
4350 }
4351
4352 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4353 bool X86::isZeroNode(SDValue Elt) {
4354 return isNullConstant(Elt) || isNullFPConstant(Elt);
4355 }
4356
4357 // Build a vector of constants.
4358 // Use an UNDEF node if MaskElt == -1.
4359 // Split 64-bit constants in 32-bit mode.
4360 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4361 const SDLoc &dl, bool IsMask = false) {
4362
4363 SmallVector<SDValue, 32> Ops;
4364 bool Split = false;
4365
4366 MVT ConstVecVT = VT;
4367 unsigned NumElts = VT.getVectorNumElements();
4368 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4369 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4370 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4371 Split = true;
4372 }
4373
4374 MVT EltVT = ConstVecVT.getVectorElementType();
4375 for (unsigned i = 0; i < NumElts; ++i) {
4376 bool IsUndef = Values[i] < 0 && IsMask;
4377 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4378 DAG.getConstant(Values[i], dl, EltVT);
4379 Ops.push_back(OpNode);
4380 if (Split)
4381 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4382 DAG.getConstant(0, dl, EltVT));
4383 }
4384 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4385 if (Split)
4386 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4387 return ConstsNode;
4388 }
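// Illustrative sketch (hypothetical input): building the mask {3, -1} as a
// v2i64 with IsMask=true on a 32-bit target, where i64 is not legal,
// ConstVecVT becomes v4i32 and the node is built as
//   (v2i64 bitcast (v4i32 build_vector 3, 0, undef, undef))
// i.e. each 64-bit constant is split into a low i32 constant plus a zero
// upper half, and -1 mask elements become undef.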
4389
4390 /// Returns a vector of specified type with all zero elements.
4391 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4392 SelectionDAG &DAG, const SDLoc &dl) {
4393 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4394 VT.getVectorElementType() == MVT::i1) &&
4395 "Unexpected vector type");
4396
4397 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4398 // type. This ensures they get CSE'd. But if the integer type is not
4399 // available, use a floating-point +0.0 instead.
4400 SDValue Vec;
4401 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4402 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4403 } else if (VT.getVectorElementType() == MVT::i1) {
4404 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4405 "Unexpected vector type");
4406 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4407 "Unexpected vector type");
4408 Vec = DAG.getConstant(0, dl, VT);
4409 } else {
4410 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4411 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4412 }
4413 return DAG.getBitcast(VT, Vec);
4414 }
4415
4416 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4417 const SDLoc &dl, unsigned vectorWidth) {
4418 assert((vectorWidth == 128 || vectorWidth == 256) &&
4419 "Unsupported vector width");
4420 EVT VT = Vec.getValueType();
4421 EVT ElVT = VT.getVectorElementType();
4422 unsigned Factor = VT.getSizeInBits()/vectorWidth;
4423 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4424 VT.getVectorNumElements()/Factor);
4425
4426 // Extract from UNDEF is UNDEF.
4427 if (Vec.isUndef())
4428 return DAG.getUNDEF(ResultVT);
4429
4430 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4431 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4432 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4433
4434 // This is the index of the first element of the vectorWidth-bit chunk
4435 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4436 IdxVal &= ~(ElemsPerChunk - 1);
4437
4438 // If the input is a buildvector just emit a smaller one.
4439 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4440 return DAG.getNode(ISD::BUILD_VECTOR,
4441 dl, ResultVT, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
4442
4443 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4444 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4445 }
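// Example of the index rounding above (hypothetical operands): extracting a
// 128-bit chunk from a v8i32 with IdxVal = 5 gives ElemsPerChunk = 4, so
// IdxVal is rounded down to 4 and the EXTRACT_SUBVECTOR grabs elements 4..7
// (the upper lane); callers are expected to handle the remaining offset
// within that lane themselves.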
4446
4447 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4448 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4449 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4450 /// instructions or a simple subregister reference. Idx is an index in the
4451 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4452 /// lowering EXTRACT_VECTOR_ELT operations easier.
4453 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4454 SelectionDAG &DAG, const SDLoc &dl) {
4455 assert((Vec.getValueType().is256BitVector() ||
4456 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4457 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4458 }
4459
4460 /// Generate a DAG to grab 256-bits from a 512-bit vector.
4461 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4462 SelectionDAG &DAG, const SDLoc &dl) {
4463 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4464 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4465 }
4466
4467 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4468 SelectionDAG &DAG, const SDLoc &dl,
4469 unsigned vectorWidth) {
4470 assert((vectorWidth == 128 || vectorWidth == 256) &&
4471 "Unsupported vector width");
4472 // Inserting an UNDEF subvector is a no-op: just return Result.
4473 if (Vec.isUndef())
4474 return Result;
4475 EVT VT = Vec.getValueType();
4476 EVT ElVT = VT.getVectorElementType();
4477 EVT ResultVT = Result.getValueType();
4478
4479 // Insert the relevant vectorWidth bits.
4480 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4481 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4482
4483 // This is the index of the first element of the vectorWidth-bit chunk
4484 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4485 IdxVal &= ~(ElemsPerChunk - 1);
4486
4487 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4488 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4489 }
4490
4491 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
4492 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4493 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4494 /// simple superregister reference. Idx is an index in the 128 bits
4495 /// we want. It need not be aligned to a 128-bit boundary. That makes
4496 /// lowering INSERT_VECTOR_ELT operations easier.
4497 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4498 SelectionDAG &DAG, const SDLoc &dl) {
4499 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4500
4501 // For insertion into the zero index (low half) of a 256-bit vector, it is
4502 // more efficient to generate a blend with immediate instead of an insert*128.
4503 // We are still creating an INSERT_SUBVECTOR below with an undef node to
4504 // extend the subvector to the size of the result vector. Make sure that
4505 // we are not recursing on that node by checking for undef here.
4506 if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
4507 !Result.isUndef()) {
4508 EVT ResultVT = Result.getValueType();
4509 SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
4510 SDValue Undef = DAG.getUNDEF(ResultVT);
4511 SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
4512 Vec, ZeroIndex);
4513
4514 // The blend instruction, and therefore its mask, depend on the data type.
4515 MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
4516 if (ScalarType.isFloatingPoint()) {
4517 // Choose either vblendps (float) or vblendpd (double).
4518 unsigned ScalarSize = ScalarType.getSizeInBits();
4519 assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
4520 unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
4521 SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
4522 return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
4523 }
4524
4525 const X86Subtarget &Subtarget =
4526 static_cast<const X86Subtarget &>(DAG.getSubtarget());
4527
4528 // AVX2 is needed for 256-bit integer blend support.
4529 // Integers must be cast to 32-bit because there is only vpblendd;
4530 // vpblendw can't be used for this because it has a handicapped mask.
4531
4532 // If we don't have AVX2, then cast to float. Using a wrong domain blend
4533 // is still more efficient than using the wrong domain vinsertf128 that
4534 // will be created by InsertSubVector().
4535 MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
4536
4537 SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
4538 Result = DAG.getBitcast(CastVT, Result);
4539 Vec256 = DAG.getBitcast(CastVT, Vec256);
4540 Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
4541 return DAG.getBitcast(ResultVT, Vec256);
4542 }
4543
4544 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4545 }
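// Blend-mask sketch for the IdxVal == 0 fast path above (hypothetical types):
// inserting a v2f64 into the low half of a v4f64 uses vblendpd with
// MaskVal = 0x03 (take lanes 0-1 from the widened subvector, lanes 2-3 from
// Result); the f32 and integer paths use 0x0f since they blend four 32-bit
// lanes per 128-bit half.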
4546
4547 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4548 SelectionDAG &DAG, const SDLoc &dl) {
4549 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
4550 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
4551 }
4552
4553 /// Insert an i1 subvector into an i1 vector.
4554 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4555 const X86Subtarget &Subtarget) {
4556
4557 SDLoc dl(Op);
4558 SDValue Vec = Op.getOperand(0);
4559 SDValue SubVec = Op.getOperand(1);
4560 SDValue Idx = Op.getOperand(2);
4561
4562 if (!isa<ConstantSDNode>(Idx))
4563 return SDValue();
4564
4565 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
4566 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4567 return Op;
4568
4569 MVT OpVT = Op.getSimpleValueType();
4570 MVT SubVecVT = SubVec.getSimpleValueType();
4571 unsigned NumElems = OpVT.getVectorNumElements();
4572 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4573
4574 assert(IdxVal + SubVecNumElems <= NumElems &&
4575 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4576 "Unexpected index value in INSERT_SUBVECTOR");
4577
4578 // There are 3 possible cases:
4579 // 1. Subvector should be inserted in the lower part (IdxVal == 0)
4580 // 2. Subvector should be inserted in the upper part
4581 // (IdxVal + SubVecNumElems == NumElems)
4582 // 3. Subvector should be inserted in the middle (for example v2i1
4583 // to v16i1, index 2)
4584
4585 // Extend to a natively supported kshift width.
4586 MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4587 MVT WideOpVT = OpVT;
4588 if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
4589 WideOpVT = MinVT;
4590
4591 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
4592 SDValue Undef = DAG.getUNDEF(WideOpVT);
4593 SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4594 Undef, SubVec, ZeroIdx);
4595
4596 // Extract the sub-vector if required.
4597 auto ExtractSubVec = [&](SDValue V) {
4598 return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
4599 OpVT, V, ZeroIdx);
4600 };
4601
4602 if (Vec.isUndef()) {
4603 if (IdxVal != 0) {
4604 SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
4605 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
4606 }
4607 return ExtractSubVec(WideSubVec);
4608 }
4609
4610 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4611 NumElems = WideOpVT.getVectorNumElements();
4612 unsigned ShiftLeft = NumElems - SubVecNumElems;
4613 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4614 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
4615 DAG.getConstant(ShiftLeft, dl, MVT::i8));
4616 Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
4617 DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
4618 return ExtractSubVec(Vec);
4619 }
4620
4621 if (IdxVal == 0) {
4622 // Zero lower bits of the Vec
4623 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
4624 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4625 Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
4626 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
4627 // Merge them together, SubVec should be zero extended.
4628 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4629 getZeroVector(WideOpVT, Subtarget, DAG, dl),
4630 SubVec, ZeroIdx);
4631 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
4632 return ExtractSubVec(Vec);
4633 }
4634
4635 // Simple case when we put subvector in the upper part
4636 if (IdxVal + SubVecNumElems == NumElems) {
4637 // Zero upper bits of the Vec
4638 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
4639 DAG.getConstant(IdxVal, dl, MVT::i8));
4640 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
4641 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4642 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
4643 Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
4644 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
4645 return ExtractSubVec(Vec);
4646 }
4647 // Subvector should be inserted in the middle - use shuffle
4648 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
4649 SubVec, ZeroIdx);
4650 SmallVector<int, 64> Mask;
4651 for (unsigned i = 0; i < NumElems; ++i)
4652 Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
4653 i : i + NumElems);
4654 return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
4655 }
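// Shift-count sketch for the all-zero source case above (hypothetical
// operands, assuming DQI so WideOpVT stays v8i1): inserting a v2i1 subvector
// at IdxVal = 2 into a zero v8i1 gives ShiftLeft = 8 - 2 = 6 and
// ShiftRight = 8 - 2 - 2 = 4; shifting the widened subvector left by 6 and
// then right by 4 lands its two bits at positions 2..3 with everything else
// cleared, which is the desired result.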
4656
4657 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
4658 /// instructions. This is used because creating CONCAT_VECTORS nodes of
4659 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
4660 /// large BUILD_VECTORS.
4661 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
4662 unsigned NumElems, SelectionDAG &DAG,
4663 const SDLoc &dl) {
4664 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
4665 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
4666 }
4667
4668 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
4669 unsigned NumElems, SelectionDAG &DAG,
4670 const SDLoc &dl) {
4671 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
4672 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
4673 }
4674
4675 /// Returns a vector of specified type with all bits set.
4676 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4677 /// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
4678 /// Then bitcast to their original type, ensuring they get CSE'd.
4679 static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
4680 SelectionDAG &DAG, const SDLoc &dl) {
4681 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4682 "Expected a 128/256/512-bit vector type");
4683
4684 APInt Ones = APInt::getAllOnesValue(32);
4685 unsigned NumElts = VT.getSizeInBits() / 32;
4686 SDValue Vec;
4687 if (!Subtarget.hasInt256() && NumElts == 8) {
4688 Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
4689 Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4690 } else {
4691 Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
4692 }
4693 return DAG.getBitcast(VT, Vec);
4694 }
4695
4696 /// Returns a vector_shuffle node for an unpackl operation.
4697 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
4698 SDValue V1, SDValue V2) {
4699 assert(VT.is128BitVector() && "Expected a 128-bit vector type");
4700 unsigned NumElems = VT.getVectorNumElements();
4701 SmallVector<int, 8> Mask(NumElems);
4702 for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4703 Mask[i * 2] = i;
4704 Mask[i * 2 + 1] = i + NumElems;
4705 }
4706 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4707 }
4708
4709 /// Returns a vector_shuffle node for an unpackh operation.
4710 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
4711 SDValue V1, SDValue V2) {
4712 assert(VT.is128BitVector() && "Expected a 128-bit vector type");
4713 unsigned NumElems = VT.getVectorNumElements();
4714 SmallVector<int, 8> Mask(NumElems);
4715 for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4716 Mask[i * 2] = i + Half;
4717 Mask[i * 2 + 1] = i + NumElems + Half;
4718 }
4719 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4720 }
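// Mask sketch for the two helpers above: for a v4i32 the loops produce
//   getUnpackl -> shuffle mask {0, 4, 1, 5}   (interleave the low halves)
//   getUnpackh -> shuffle mask {2, 6, 3, 7}   (interleave the high halves)
// matching the punpckldq / punpckhdq lane layout.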
4721
4722 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
4723 /// This produces a shuffle where the low element of V2 is swizzled into the
4724 /// zero/undef vector, landing at element Idx.
4725 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4726 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4727 bool IsZero,
4728 const X86Subtarget &Subtarget,
4729 SelectionDAG &DAG) {
4730 MVT VT = V2.getSimpleValueType();
4731 SDValue V1 = IsZero
4732 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4733 int NumElems = VT.getVectorNumElements();
4734 SmallVector<int, 16> MaskVec(NumElems);
4735 for (int i = 0; i != NumElems; ++i)
4736 // If this is the insertion idx, put the low elt of V2 here.
4737 MaskVec[i] = (i == Idx) ? NumElems : i;
4738 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4739 }
4740
4741 static SDValue peekThroughBitcasts(SDValue V) {
4742 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
4743 V = V.getOperand(0);
4744 return V;
4745 }
4746
4747 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
4748 unsigned MaskEltSizeInBits,
4749 SmallVectorImpl<uint64_t> &RawMask) {
4750 MaskNode = peekThroughBitcasts(MaskNode);
4751
4752 MVT VT = MaskNode.getSimpleValueType();
4753 assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
4754
4755 // Split an APInt element into MaskEltSizeInBits sized pieces and
4756 // insert into the shuffle mask.
4757 auto SplitElementToMask = [&](APInt Element) {
4758 // Note that this is x86 and so always little endian: the low byte is
4759 // the first byte of the mask.
4760 int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
4761 for (int i = 0; i < Split; ++i) {
4762 APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
4763 Element = Element.lshr(MaskEltSizeInBits);
4764 RawMask.push_back(RawElt.getZExtValue());
4765 }
4766 };
4767
4768 if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
4769 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
4770 // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
4771 if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
4772 return false;
4773 if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
4774 const APInt &MaskElement = CN->getAPIntValue();
4775 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
4776 APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
4777 RawMask.push_back(RawElt.getZExtValue());
4778 }
4779 }
4780 return false;
4781 }
4782
4783 if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
4784 MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
4785
4786 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
4787 if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
4788 return false;
4789 unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
4790
4791 SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
4792 if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
4793 SplitElementToMask(CN->getAPIntValue());
4794 RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
4795 return true;
4796 }
4797 return false;
4798 }
4799
4800 if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
4801 return false;
4802
4803 // We can always decode if the buildvector is all zero constants,
4804 // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
4805 if (llvm::all_of(MaskNode->ops(), X86::isZeroNode)) {
4806 RawMask.append(VT.getSizeInBits() / MaskEltSizeInBits, 0);
4807 return true;
4808 }
4809
4810 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
4811 if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
4812 return false;
4813
4814 for (SDValue Op : MaskNode->ops()) {
4815 if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
4816 SplitElementToMask(CN->getAPIntValue());
4817 else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
4818 SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
4819 else
4820 return false;
4821 }
4822
4823 return true;
4824 }
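// Decoding sketch (hypothetical constant): for a PSHUFB-style mask requested
// with MaskEltSizeInBits = 8, a v2i64 build_vector element such as
// 0x0706050403020100 is split little-endian into the raw bytes
// 00, 01, ..., 07, each appended to RawMask in turn, so one 64-bit constant
// contributes eight mask indices.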
4825
4826 static const Constant *getTargetShuffleMaskConstant(SDValue MaskNode) {
4827 MaskNode = peekThroughBitcasts(MaskNode);
4828
4829 auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
4830 if (!MaskLoad)
4831 return nullptr;
4832
4833 SDValue Ptr = MaskLoad->getBasePtr();
4834 if (Ptr->getOpcode() == X86ISD::Wrapper ||
4835 Ptr->getOpcode() == X86ISD::WrapperRIP)
4836 Ptr = Ptr->getOperand(0);
4837
4838 auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
4839 if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
4840 return nullptr;
4841
4842 return dyn_cast<Constant>(MaskCP->getConstVal());
4843 }
4844
4845 /// Calculates the shuffle mask corresponding to the target-specific opcode.
4846 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
4847 /// operands in \p Ops, and returns true.
4848 /// Sets \p IsUnary to true if only one source is used. Note that this will set
4849 /// IsUnary for shuffles which use a single input multiple times, and in those
4850 /// cases it will adjust the mask to only have indices within that single input.
4851 /// It is an error to call this with non-empty Mask/Ops vectors.
4852 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
4853 SmallVectorImpl<SDValue> &Ops,
4854 SmallVectorImpl<int> &Mask, bool &IsUnary) {
4855 unsigned NumElems = VT.getVectorNumElements();
4856 SDValue ImmN;
4857
4858 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
4859 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
4860
4861 IsUnary = false;
4862 bool IsFakeUnary = false;
4863 switch(N->getOpcode()) {
4864 case X86ISD::BLENDI:
4865 ImmN = N->getOperand(N->getNumOperands()-1);
4866 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4867 break;
4868 case X86ISD::SHUFP:
4869 ImmN = N->getOperand(N->getNumOperands()-1);
4870 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4871 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4872 break;
4873 case X86ISD::INSERTPS:
4874 ImmN = N->getOperand(N->getNumOperands()-1);
4875 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4876 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4877 break;
4878 case X86ISD::UNPCKH:
4879 DecodeUNPCKHMask(VT, Mask);
4880 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4881 break;
4882 case X86ISD::UNPCKL:
4883 DecodeUNPCKLMask(VT, Mask);
4884 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4885 break;
4886 case X86ISD::MOVHLPS:
4887 DecodeMOVHLPSMask(NumElems, Mask);
4888 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4889 break;
4890 case X86ISD::MOVLHPS:
4891 DecodeMOVLHPSMask(NumElems, Mask);
4892 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4893 break;
4894 case X86ISD::PALIGNR:
4895 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
4896 ImmN = N->getOperand(N->getNumOperands()-1);
4897 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4898 break;
4899 case X86ISD::VSHLDQ:
4900 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
4901 ImmN = N->getOperand(N->getNumOperands() - 1);
4902 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4903 IsUnary = true;
4904 break;
4905 case X86ISD::VSRLDQ:
4906 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
4907 ImmN = N->getOperand(N->getNumOperands() - 1);
4908 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4909 IsUnary = true;
4910 break;
4911 case X86ISD::PSHUFD:
4912 case X86ISD::VPERMILPI:
4913 ImmN = N->getOperand(N->getNumOperands()-1);
4914 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4915 IsUnary = true;
4916 break;
4917 case X86ISD::PSHUFHW:
4918 ImmN = N->getOperand(N->getNumOperands()-1);
4919 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4920 IsUnary = true;
4921 break;
4922 case X86ISD::PSHUFLW:
4923 ImmN = N->getOperand(N->getNumOperands()-1);
4924 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4925 IsUnary = true;
4926 break;
4927 case X86ISD::VZEXT_MOVL:
4928 DecodeZeroMoveLowMask(VT, Mask);
4929 IsUnary = true;
4930 break;
4931 case X86ISD::VPERMILPV: {
4932 IsUnary = true;
4933 SDValue MaskNode = N->getOperand(1);
4934 unsigned MaskEltSize = VT.getScalarSizeInBits();
4935 SmallVector<uint64_t, 32> RawMask;
4936 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
4937 DecodeVPERMILPMask(VT, RawMask, Mask);
4938 break;
4939 }
4940 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
4941 DecodeVPERMILPMask(C, MaskEltSize, Mask);
4942 break;
4943 }
4944 return false;
4945 }
4946 case X86ISD::PSHUFB: {
4947 IsUnary = true;
4948 SDValue MaskNode = N->getOperand(1);
4949 SmallVector<uint64_t, 32> RawMask;
4950 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
4951 DecodePSHUFBMask(RawMask, Mask);
4952 break;
4953 }
4954 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
4955 DecodePSHUFBMask(C, Mask);
4956 break;
4957 }
4958 return false;
4959 }
4960 case X86ISD::VPERMI:
4961 ImmN = N->getOperand(N->getNumOperands()-1);
4962 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4963 IsUnary = true;
4964 break;
4965 case X86ISD::MOVSS:
4966 case X86ISD::MOVSD:
4967 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
4968 break;
4969 case X86ISD::VPERM2X128:
4970 ImmN = N->getOperand(N->getNumOperands()-1);
4971 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4972 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4973 break;
4974 case X86ISD::MOVSLDUP:
4975 DecodeMOVSLDUPMask(VT, Mask);
4976 IsUnary = true;
4977 break;
4978 case X86ISD::MOVSHDUP:
4979 DecodeMOVSHDUPMask(VT, Mask);
4980 IsUnary = true;
4981 break;
4982 case X86ISD::MOVDDUP:
4983 DecodeMOVDDUPMask(VT, Mask);
4984 IsUnary = true;
4985 break;
4986 case X86ISD::MOVLHPD:
4987 case X86ISD::MOVLPD:
4988 case X86ISD::MOVLPS:
4989 // Not yet implemented
4990 return false;
4991 case X86ISD::VPERMIL2: {
4992 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4993 unsigned MaskEltSize = VT.getScalarSizeInBits();
4994 SDValue MaskNode = N->getOperand(2);
4995 SDValue CtrlNode = N->getOperand(3);
4996 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
4997 unsigned CtrlImm = CtrlOp->getZExtValue();
4998 SmallVector<uint64_t, 32> RawMask;
4999 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5000 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5001 break;
5002 }
5003 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5004 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5005 break;
5006 }
5007 }
5008 return false;
5009 }
5010 case X86ISD::VPPERM: {
5011 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5012 SDValue MaskNode = N->getOperand(2);
5013 SmallVector<uint64_t, 32> RawMask;
5014 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5015 DecodeVPPERMMask(RawMask, Mask);
5016 break;
5017 }
5018 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5019 DecodeVPPERMMask(C, Mask);
5020 break;
5021 }
5022 return false;
5023 }
5024 case X86ISD::VPERMV: {
5025 IsUnary = true;
5026 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5027 Ops.push_back(N->getOperand(1));
5028 SDValue MaskNode = N->getOperand(0);
5029 SmallVector<uint64_t, 32> RawMask;
5030 unsigned MaskEltSize = VT.getScalarSizeInBits();
5031 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5032 DecodeVPERMVMask(RawMask, Mask);
5033 break;
5034 }
5035 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5036 DecodeVPERMVMask(C, VT, Mask);
5037 break;
5038 }
5039 return false;
5040 }
5041 case X86ISD::VPERMV3: {
5042 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5043 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5044 Ops.push_back(N->getOperand(0));
5045 Ops.push_back(N->getOperand(2));
5046 SDValue MaskNode = N->getOperand(1);
5047 if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5048 DecodeVPERMV3Mask(C, VT, Mask);
5049 break;
5050 }
5051 return false;
5052 }
5053 default: llvm_unreachable("unknown target shuffle node");
5054 }
5055
5056 // Empty mask indicates the decode failed.
5057 if (Mask.empty())
5058 return false;
5059
5060 // Check if we're getting a shuffle mask with zero'd elements.
5061 if (!AllowSentinelZero)
5062 if (llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5063 return false;
5064
5065 // If we have a fake unary shuffle, the shuffle mask is spread across two
5066 // inputs that are actually the same node. Re-map the mask to always point
5067 // into the first input.
5068 if (IsFakeUnary)
5069 for (int &M : Mask)
5070 if (M >= (int)Mask.size())
5071 M -= Mask.size();
5072
5073 // If we didn't already add operands in the opcode-specific code, default to
5074 // adding 1 or 2 operands starting at 0.
5075 if (Ops.empty()) {
5076 Ops.push_back(N->getOperand(0));
5077 if (!IsUnary || IsFakeUnary)
5078 Ops.push_back(N->getOperand(1));
5079 }
5080
5081 return true;
5082 }
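// Fake-unary sketch (hypothetical node): an X86ISD::UNPCKL of v4i32 whose two
// operands are the same SDValue first decodes to {0, 4, 1, 5}; because
// IsFakeUnary is set, the re-mapping loop above folds the mask to
// {0, 0, 1, 1} so every index refers to the first input, even though both
// (identical) operands are still appended to Ops.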
5083
5084 /// Check a target shuffle mask's inputs to see if we can set any values to
5085 /// SM_SentinelZero - this is for elements that are known to be zero
5086 /// (not just zeroable) from their inputs.
5087 /// Returns true if the target shuffle mask was decoded.
5088 static bool setTargetShuffleZeroElements(SDValue N,
5089 SmallVectorImpl<int> &Mask,
5090 SmallVectorImpl<SDValue> &Ops) {
5091 bool IsUnary;
5092 if (!isTargetShuffle(N.getOpcode()))
5093 return false;
5094 if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Ops,
5095 Mask, IsUnary))
5096 return false;
5097
5098 SDValue V1 = Ops[0];
5099 SDValue V2 = IsUnary ? V1 : Ops[1];
5100
5101 V1 = peekThroughBitcasts(V1);
5102 V2 = peekThroughBitcasts(V2);
5103
5104 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5105 int M = Mask[i];
5106
5107 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5108 if (M < 0)
5109 continue;
5110
5111 // Determine shuffle input and normalize the mask.
5112 SDValue V = M < Size ? V1 : V2;
5113 M %= Size;
5114
5115 // We are referencing an UNDEF input.
5116 if (V.isUndef()) {
5117 Mask[i] = SM_SentinelUndef;
5118 continue;
5119 }
5120
5121 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5122 if (V.getOpcode() != ISD::BUILD_VECTOR)
5123 continue;
5124
5125 // If the BUILD_VECTOR has fewer elements than the mask, then the (larger)
5126 // source element that a mask index maps to must be UNDEF/ZERO.
5127 // TODO: Is it worth testing the individual bits of a constant?
5128 if ((Size % V.getNumOperands()) == 0) {
5129 int Scale = Size / V->getNumOperands();
5130 SDValue Op = V.getOperand(M / Scale);
5131 if (Op.isUndef())
5132 Mask[i] = SM_SentinelUndef;
5133 else if (X86::isZeroNode(Op))
5134 Mask[i] = SM_SentinelZero;
5135 continue;
5136 }
5137
5138 // If the BUILD_VECTOR has more elements than the mask, then all of the
5139 // (smaller) source elements a mask index maps to must be all UNDEF or all ZERO.
5140 if ((V.getNumOperands() % Size) == 0) {
5141 int Scale = V->getNumOperands() / Size;
5142 bool AllUndef = true;
5143 bool AllZero = true;
5144 for (int j = 0; j < Scale; ++j) {
5145 SDValue Op = V.getOperand((M * Scale) + j);
5146 AllUndef &= Op.isUndef();
5147 AllZero &= X86::isZeroNode(Op);
5148 }
5149 if (AllUndef)
5150 Mask[i] = SM_SentinelUndef;
5151 else if (AllZero)
5152 Mask[i] = SM_SentinelZero;
5153 continue;
5154 }
5155 }
5156
5157 return true;
5158 }
5159
5160 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
5161 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
5162 /// remaining input indices in case we now have a unary shuffle and adjust the
5163 /// Op0/Op1 inputs accordingly.
5164 /// Returns true if the target shuffle mask was decoded.
5165 static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
5166 SmallVectorImpl<int> &Mask) {
5167 SmallVector<SDValue, 2> Ops;
5168 if (!setTargetShuffleZeroElements(Op, Mask, Ops))
5169 return false;
5170
5171 int NumElts = Mask.size();
5172 bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) {
5173 return 0 <= Idx && Idx < NumElts;
5174 });
5175 bool Op1InUse = std::any_of(Mask.begin(), Mask.end(),
5176 [NumElts](int Idx) { return NumElts <= Idx; });
5177
5178 Op0 = Op0InUse ? Ops[0] : SDValue();
5179 Op1 = Op1InUse ? Ops[1] : SDValue();
5180
5181 // We're only using Op1 - commute the mask and inputs.
5182 if (!Op0InUse && Op1InUse) {
5183 for (int &M : Mask)
5184 if (NumElts <= M)
5185 M -= NumElts;
5186 Op0 = Op1;
5187 Op1 = SDValue();
5188 }
5189
5190 return true;
5191 }
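// Commutation sketch (hypothetical mask): with NumElts = 4 and a decoded mask
// of {4, 5, SM_SentinelUndef, 7}, only the second operand is referenced, so
// the code above rebases the mask to {0, 1, SM_SentinelUndef, 3}, makes Op0
// the old Ops[1], and leaves Op1 as an empty SDValue.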
5192
5193 /// Returns the scalar element that will make up the ith
5194 /// element of the result of the vector shuffle.
5195 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5196 unsigned Depth) {
5197 if (Depth == 6)
5198 return SDValue(); // Limit search depth.
5199
5200 SDValue V = SDValue(N, 0);
5201 EVT VT = V.getValueType();
5202 unsigned Opcode = V.getOpcode();
5203
5204 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5205 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5206 int Elt = SV->getMaskElt(Index);
5207
5208 if (Elt < 0)
5209 return DAG.getUNDEF(VT.getVectorElementType());
5210
5211 unsigned NumElems = VT.getVectorNumElements();
5212 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5213 : SV->getOperand(1);
5214 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5215 }
5216
5217 // Recurse into target specific vector shuffles to find scalars.
5218 if (isTargetShuffle(Opcode)) {
5219 MVT ShufVT = V.getSimpleValueType();
5220 MVT ShufSVT = ShufVT.getVectorElementType();
5221 int NumElems = (int)ShufVT.getVectorNumElements();
5222 SmallVector<int, 16> ShuffleMask;
5223 SmallVector<SDValue, 16> ShuffleOps;
5224 bool IsUnary;
5225
5226 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
5227 return SDValue();
5228
5229 int Elt = ShuffleMask[Index];
5230 if (Elt == SM_SentinelZero)
5231 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
5232 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
5233 if (Elt == SM_SentinelUndef)
5234 return DAG.getUNDEF(ShufSVT);
5235
5236 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
5237 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
5238 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5239 Depth+1);
5240 }
5241
5242 // Actual nodes that may contain scalar elements
5243 if (Opcode == ISD::BITCAST) {
5244 V = V.getOperand(0);
5245 EVT SrcVT = V.getValueType();
5246 unsigned NumElems = VT.getVectorNumElements();
5247
5248 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5249 return SDValue();
5250 }
5251
5252 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5253 return (Index == 0) ? V.getOperand(0)
5254 : DAG.getUNDEF(VT.getVectorElementType());
5255
5256 if (V.getOpcode() == ISD::BUILD_VECTOR)
5257 return V.getOperand(Index);
5258
5259 return SDValue();
5260 }
5261
5262 /// Custom lower build_vector of v16i8.
5263 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5264 unsigned NumNonZero, unsigned NumZero,
5265 SelectionDAG &DAG,
5266 const X86Subtarget &Subtarget,
5267 const TargetLowering &TLI) {
5268 if (NumNonZero > 8)
5269 return SDValue();
5270
5271 SDLoc dl(Op);
5272 SDValue V;
5273 bool First = true;
5274
5275 // SSE4.1 - use PINSRB to insert each byte directly.
5276 if (Subtarget.hasSSE41()) {
5277 for (unsigned i = 0; i < 16; ++i) {
5278 bool isNonZero = (NonZeros & (1 << i)) != 0;
5279 if (isNonZero) {
5280 if (First) {
5281 if (NumZero)
5282 V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
5283 else
5284 V = DAG.getUNDEF(MVT::v16i8);
5285 First = false;
5286 }
5287 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5288 MVT::v16i8, V, Op.getOperand(i),
5289 DAG.getIntPtrConstant(i, dl));
5290 }
5291 }
5292
5293 return V;
5294 }
5295
5296 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
5297 for (unsigned i = 0; i < 16; ++i) {
5298 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5299 if (ThisIsNonZero && First) {
5300 if (NumZero)
5301 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5302 else
5303 V = DAG.getUNDEF(MVT::v8i16);
5304 First = false;
5305 }
5306
5307 if ((i & 1) != 0) {
5308 SDValue ThisElt, LastElt;
5309 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5310 if (LastIsNonZero) {
5311 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5312 MVT::i16, Op.getOperand(i-1));
5313 }
5314 if (ThisIsNonZero) {
5315 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5316 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5317 ThisElt, DAG.getConstant(8, dl, MVT::i8));
5318 if (LastIsNonZero)
5319 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5320 } else
5321 ThisElt = LastElt;
5322
5323 if (ThisElt.getNode())
5324 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5325 DAG.getIntPtrConstant(i/2, dl));
5326 }
5327 }
5328
5329 return DAG.getBitcast(MVT::v16i8, V);
5330 }
5331
5332 /// Custom lower build_vector of v8i16.
5333 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5334 unsigned NumNonZero, unsigned NumZero,
5335 SelectionDAG &DAG,
5336 const X86Subtarget &Subtarget,
5337 const TargetLowering &TLI) {
5338 if (NumNonZero > 4)
5339 return SDValue();
5340
5341 SDLoc dl(Op);
5342 SDValue V;
5343 bool First = true;
5344 for (unsigned i = 0; i < 8; ++i) {
5345 bool isNonZero = (NonZeros & (1 << i)) != 0;
5346 if (isNonZero) {
5347 if (First) {
5348 if (NumZero)
5349 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5350 else
5351 V = DAG.getUNDEF(MVT::v8i16);
5352 First = false;
5353 }
5354 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5355 MVT::v8i16, V, Op.getOperand(i),
5356 DAG.getIntPtrConstant(i, dl));
5357 }
5358 }
5359
5360 return V;
5361 }
5362
5363 /// Custom lower build_vector of v4i32 or v4f32.
5364 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5365 const X86Subtarget &Subtarget,
5366 const TargetLowering &TLI) {
5367 // Find all zeroable elements.
5368 std::bitset<4> Zeroable;
5369 for (int i=0; i < 4; ++i) {
5370 SDValue Elt = Op->getOperand(i);
5371 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
5372 }
5373 assert(Zeroable.size() - Zeroable.count() > 1 &&
5374 "We expect at least two non-zero elements!");
5375
5376 // We only know how to deal with build_vector nodes where elements are either
5377 // zeroable or extract_vector_elt with constant index.
5378 SDValue FirstNonZero;
5379 unsigned FirstNonZeroIdx;
5380 for (unsigned i=0; i < 4; ++i) {
5381 if (Zeroable[i])
5382 continue;
5383 SDValue Elt = Op->getOperand(i);
5384 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5385 !isa<ConstantSDNode>(Elt.getOperand(1)))
5386 return SDValue();
5387 // Make sure that this node is extracting from a 128-bit vector.
5388 MVT VT = Elt.getOperand(0).getSimpleValueType();
5389 if (!VT.is128BitVector())
5390 return SDValue();
5391 if (!FirstNonZero.getNode()) {
5392 FirstNonZero = Elt;
5393 FirstNonZeroIdx = i;
5394 }
5395 }
5396
5397 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5398 SDValue V1 = FirstNonZero.getOperand(0);
5399 MVT VT = V1.getSimpleValueType();
5400
5401 // See if this build_vector can be lowered as a blend with zero.
5402 SDValue Elt;
5403 unsigned EltMaskIdx, EltIdx;
5404 int Mask[4];
5405 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5406 if (Zeroable[EltIdx]) {
5407 // The zero vector will be on the right hand side.
5408 Mask[EltIdx] = EltIdx+4;
5409 continue;
5410 }
5411
5412 Elt = Op->getOperand(EltIdx);
5413 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
5414 EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5415 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5416 break;
5417 Mask[EltIdx] = EltIdx;
5418 }
5419
5420 if (EltIdx == 4) {
5421 // Let the shuffle legalizer deal with blend operations.
5422 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5423 if (V1.getSimpleValueType() != VT)
5424 V1 = DAG.getBitcast(VT, V1);
5425 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
5426 }
5427
5428 // See if we can lower this build_vector to an INSERTPS.
5429 if (!Subtarget.hasSSE41())
5430 return SDValue();
5431
5432 SDValue V2 = Elt.getOperand(0);
5433 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5434 V1 = SDValue();
5435
5436 bool CanFold = true;
5437 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5438 if (Zeroable[i])
5439 continue;
5440
5441 SDValue Current = Op->getOperand(i);
5442 SDValue SrcVector = Current->getOperand(0);
5443 if (!V1.getNode())
5444 V1 = SrcVector;
5445 CanFold = SrcVector == V1 &&
5446 cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5447 }
5448
5449 if (!CanFold)
5450 return SDValue();
5451
5452 assert(V1.getNode() && "Expected at least two non-zero elements!");
5453 if (V1.getSimpleValueType() != MVT::v4f32)
5454 V1 = DAG.getBitcast(MVT::v4f32, V1);
5455 if (V2.getSimpleValueType() != MVT::v4f32)
5456 V2 = DAG.getBitcast(MVT::v4f32, V2);
5457
5458 // Ok, we can emit an INSERTPS instruction.
5459 unsigned ZMask = Zeroable.to_ulong();
5460
5461 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5462 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5463 SDLoc DL(Op);
5464 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
5465 DAG.getIntPtrConstant(InsertPSMask, DL));
5466 return DAG.getBitcast(VT, Result);
5467 }
5468
5469 /// Return a vector logical shift node.
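/// The shift is performed on a v16i8 bitcast of the source, so e.g. a left
/// shift of a v2i64 value by 64 bits becomes a whole-vector byte shift
/// (roughly PSLLDQ by 8 bytes), with the result bitcast back to v2i64.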
5470 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
5471 SelectionDAG &DAG, const TargetLowering &TLI,
5472 const SDLoc &dl) {
5473 assert(VT.is128BitVector() && "Unknown type for VShift");
5474 MVT ShVT = MVT::v16i8;
5475 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5476 SrcOp = DAG.getBitcast(ShVT, SrcOp);
5477 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
5478 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
5479 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
5480 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
5481 }
5482
5483 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
5484 SelectionDAG &DAG) {
5485
5486 // Check if the scalar load can be widened into a vector load, and if
5487 // the address is "base + cst", see if the cst can be "absorbed" into
5488 // the shuffle mask.
5489 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5490 SDValue Ptr = LD->getBasePtr();
5491 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5492 return SDValue();
5493 EVT PVT = LD->getValueType(0);
5494 if (PVT != MVT::i32 && PVT != MVT::f32)
5495 return SDValue();
5496
5497 int FI = -1;
5498 int64_t Offset = 0;
5499 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5500 FI = FINode->getIndex();
5501 Offset = 0;
5502 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5503 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5504 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5505 Offset = Ptr.getConstantOperandVal(1);
5506 Ptr = Ptr.getOperand(0);
5507 } else {
5508 return SDValue();
5509 }
5510
5511 // FIXME: 256-bit vector instructions don't require a strict alignment,
5512 // improve this code to support it better.
5513 unsigned RequiredAlign = VT.getSizeInBits()/8;
5514 SDValue Chain = LD->getChain();
5515 // Make sure the stack object alignment is at least 16 or 32.
5516 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5517 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5518 if (MFI->isFixedObjectIndex(FI)) {
5519 // Can't change the alignment. FIXME: It's possible to compute
5520 // the exact stack offset and reference FI + adjust offset instead,
5521 // if someone *really* cares about this; that's how to implement it.
5522 return SDValue();
5523 } else {
5524 MFI->setObjectAlignment(FI, RequiredAlign);
5525 }
5526 }
5527
5528 // (Offset % 16 or 32) must be a multiple of 4. The address is then
5529 // Ptr + (Offset & ~15).
5530 if (Offset < 0)
5531 return SDValue();
5532 if ((Offset % RequiredAlign) & 3)
5533 return SDValue();
5534 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
5535 if (StartOffset) {
5536 SDLoc DL(Ptr);
5537 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
5538 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
5539 }
5540
5541 int EltNo = (Offset - StartOffset) >> 2;
5542 unsigned NumElems = VT.getVectorNumElements();
5543
5544 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
5545 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
5546 LD->getPointerInfo().getWithOffset(StartOffset),
5547 false, false, false, 0);
5548
5549 SmallVector<int, 8> Mask(NumElems, EltNo);
5550
5551 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
5552 }
5553
5554 return SDValue();
5555 }
5556
5557 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
5558 /// elements can be replaced by a single large load which has the same value as
5559 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
5560 ///
5561 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
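/// Similarly, a fully consecutive set such as
///   <load i32 *a, load i32 *a+4, load i32 *a+8, load i32 *a+12>
/// can be replaced by a single (load v4i32 *a), assuming the loads are
/// non-volatile and the wide load is acceptable at this stage.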
5562 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
5563 SDLoc &DL, SelectionDAG &DAG,
5564 bool isAfterLegalize) {
5565 unsigned NumElems = Elts.size();
5566
5567 int LastLoadedElt = -1;
5568 SmallBitVector LoadMask(NumElems, false);
5569 SmallBitVector ZeroMask(NumElems, false);
5570 SmallBitVector UndefMask(NumElems, false);
5571
5572 // For each element in the initializer, see if we've found a load, zero or an
5573 // undef.
5574 for (unsigned i = 0; i < NumElems; ++i) {
5575 SDValue Elt = peekThroughBitcasts(Elts[i]);
5576 if (!Elt.getNode())
5577 return SDValue();
5578
5579 if (Elt.isUndef())
5580 UndefMask[i] = true;
5581 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
5582 ZeroMask[i] = true;
5583 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
5584 LoadMask[i] = true;
5585 LastLoadedElt = i;
5586 // Each loaded element must be the correct fractional portion of the
5587 // requested vector load.
5588 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
5589 return SDValue();
5590 } else
5591 return SDValue();
5592 }
5593 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
5594 "Incomplete element masks");
5595
5596 // Handle Special Cases - all undef or undef/zero.
5597 if (UndefMask.count() == NumElems)
5598 return DAG.getUNDEF(VT);
5599
5600 // FIXME: Should we return this as a BUILD_VECTOR instead?
5601 if ((ZeroMask | UndefMask).count() == NumElems)
5602 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
5603 : DAG.getConstantFP(0.0, DL, VT);
5604
5605 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5606 int FirstLoadedElt = LoadMask.find_first();
5607 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
5608 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
5609 EVT LDBaseVT = EltBase.getValueType();
5610
5611 // Consecutive loads can contain UNDEFs but not ZERO elements.
5612 // Consecutive loads with UNDEF and ZERO elements require an
5613 // additional shuffle stage to clear the ZERO elements.
5614 bool IsConsecutiveLoad = true;
5615 bool IsConsecutiveLoadWithZeros = true;
5616 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
5617 if (LoadMask[i]) {
5618 SDValue Elt = peekThroughBitcasts(Elts[i]);
5619 LoadSDNode *LD = cast<LoadSDNode>(Elt);
5620 if (!DAG.areNonVolatileConsecutiveLoads(
5621 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
5622 i - FirstLoadedElt)) {
5623 IsConsecutiveLoad = false;
5624 IsConsecutiveLoadWithZeros = false;
5625 break;
5626 }
5627 } else if (ZeroMask[i]) {
5628 IsConsecutiveLoad = false;
5629 }
5630 }
5631
5632 auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
5633 SDValue NewLd = DAG.getLoad(
5634 VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5635 LDBase->getPointerInfo(), false /*LDBase->isVolatile()*/,
5636 LDBase->isNonTemporal(), LDBase->isInvariant(), LDBase->getAlignment());
5637
5638 if (LDBase->hasAnyUseOfValue(1)) {
5639 SDValue NewChain =
5640 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
5641 SDValue(NewLd.getNode(), 1));
5642 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5643 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5644 SDValue(NewLd.getNode(), 1));
5645 }
5646
5647 return NewLd;
5648 };
5649
5650 // LOAD - all consecutive load/undefs (must start/end with a load).
5651 // If we have found an entire vector of loads and undefs, then return a large
5652 // load of the entire vector width starting at the base pointer.
5653 // If the vector contains zeros, then attempt to shuffle those elements.
5654 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
5655 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
5656 assert(LDBase && "Did not find base load for merging consecutive loads");
5657 EVT EltVT = LDBase->getValueType(0);
5658 // Ensure that the input vector size for the merged loads matches the
5659 // cumulative size of the input elements.
5660 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
5661 return SDValue();
5662
5663 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
5664 return SDValue();
5665
5666 if (IsConsecutiveLoad)
5667 return CreateLoad(VT, LDBase);
5668
5669 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
5670 // vector and a zero vector to clear out the zero elements.
5671 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
5672 SmallVector<int, 4> ClearMask(NumElems, -1);
5673 for (unsigned i = 0; i < NumElems; ++i) {
5674 if (ZeroMask[i])
5675 ClearMask[i] = i + NumElems;
5676 else if (LoadMask[i])
5677 ClearMask[i] = i;
5678 }
5679 SDValue V = CreateLoad(VT, LDBase);
5680 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
5681 : DAG.getConstantFP(0.0, DL, VT);
5682 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
5683 }
5684 }
5685
5686 int LoadSize =
5687 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
5688
5689 // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs.
5690 if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 &&
5691 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
5692 MVT VecSVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
5693 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 64);
5694 if (TLI.isTypeLegal(VecVT)) {
5695 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
5696 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
5697 SDValue ResNode =
5698 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
5699 LDBase->getPointerInfo(),
5700 LDBase->getAlignment(),
5701 false/*isVolatile*/, true/*ReadMem*/,
5702 false/*WriteMem*/);
5703
5704 // Make sure the newly-created LOAD is in the same position as LDBase in
5705 // terms of dependency. We create a TokenFactor for LDBase and ResNode,
5706 // and update uses of LDBase's output chain to use the TokenFactor.
5707 if (LDBase->hasAnyUseOfValue(1)) {
5708 SDValue NewChain =
5709 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
5710 SDValue(ResNode.getNode(), 1));
5711 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5712 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5713 SDValue(ResNode.getNode(), 1));
5714 }
5715
5716 return DAG.getBitcast(VT, ResNode);
5717 }
5718 }
5719
5720 // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs.
5721 if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 &&
5722 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
5723 MVT VecSVT = VT.isFloatingPoint() ? MVT::f32 : MVT::i32;
5724 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 32);
5725 if (TLI.isTypeLegal(VecVT)) {
5726 SDValue V = LastLoadedElt != 0 ? CreateLoad(VecSVT, LDBase)
5727 : DAG.getBitcast(VecSVT, EltBase);
5728 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
5729 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
5730 return DAG.getBitcast(VT, V);
5731 }
5732 }
5733
5734 return SDValue();
5735 }
5736
5737 /// Attempt to use the vbroadcast instruction to generate a splat value for the
5738 /// following cases:
5739 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
5740 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
5741 /// a scalar load, or a constant.
5742 /// The VBROADCAST node is returned when a pattern is found,
5743 /// or SDValue() otherwise.
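/// For example (with %p as a placeholder pointer), a BUILD_VECTOR that splats
/// (load float, float* %p) into all eight lanes of a v8f32 would typically be
/// matched here and emitted as (X86ISD::VBROADCAST (load %p)), i.e. a
/// vbroadcastss from memory on AVX.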
5744 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget &Subtarget,
5745 SelectionDAG &DAG) {
5746 // VBROADCAST requires AVX.
5747 // TODO: Splats could be generated for non-AVX CPUs using SSE
5748 // instructions, but there's less potential gain for only 128-bit vectors.
5749 if (!Subtarget.hasAVX())
5750 return SDValue();
5751
5752 MVT VT = Op.getSimpleValueType();
5753 SDLoc dl(Op);
5754
5755 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5756 "Unsupported vector type for broadcast.");
5757
5758 SDValue Ld;
5759 bool ConstSplatVal;
5760
5761 switch (Op.getOpcode()) {
5762 default:
5763 // Unknown pattern found.
5764 return SDValue();
5765
5766 case ISD::BUILD_VECTOR: {
5767 auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
5768 BitVector UndefElements;
5769 SDValue Splat = BVOp->getSplatValue(&UndefElements);
5770
5771 // We need a splat of a single value to use broadcast, and it doesn't
5772 // make any sense if the value is only in one element of the vector.
5773 if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
5774 return SDValue();
5775
5776 Ld = Splat;
5777 ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5778 Ld.getOpcode() == ISD::ConstantFP);
5779
5780 // Make sure that all of the users of a non-constant load are from the
5781 // BUILD_VECTOR node.
5782 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
5783 return SDValue();
5784 break;
5785 }
5786
5787 case ISD::VECTOR_SHUFFLE: {
5788 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5789
5790 // Shuffles must have a splat mask where the first element is
5791 // broadcasted.
5792 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
5793 return SDValue();
5794
5795 SDValue Sc = Op.getOperand(0);
5796 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
5797 Sc.getOpcode() != ISD::BUILD_VECTOR) {
5798
5799 if (!Subtarget.hasInt256())
5800 return SDValue();
5801
5802 // Use the register form of the broadcast instruction available on AVX2.
5803 if (VT.getSizeInBits() >= 256)
5804 Sc = extract128BitVector(Sc, 0, DAG, dl);
5805 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
5806 }
5807
5808 Ld = Sc.getOperand(0);
5809 ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5810 Ld.getOpcode() == ISD::ConstantFP);
5811
5812 // The scalar_to_vector node and the suspected
5813 // load node must have exactly one user.
5814 // Constants may have multiple users.
5815
5816 // AVX-512 has a register version of the broadcast.
5817 bool hasRegVer = Subtarget.hasAVX512() && VT.is512BitVector() &&
5818 Ld.getValueType().getSizeInBits() >= 32;
5819 if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
5820 !hasRegVer))
5821 return SDValue();
5822 break;
5823 }
5824 }
5825
5826 unsigned ScalarSize = Ld.getValueType().getSizeInBits();
5827 bool IsGE256 = (VT.getSizeInBits() >= 256);
5828
5829 // When optimizing for size, generate up to 5 extra bytes for a broadcast
5830 // instruction to save 8 or more bytes of constant pool data.
5831 // TODO: If multiple splats are generated to load the same constant,
5832 // it may be detrimental to overall size. There needs to be a way to detect
5833 // that condition to know if this is truly a size win.
5834 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
5835
5836 // Handle broadcasting a single constant scalar from the constant pool
5837 // into a vector.
5838 // On Sandybridge (no AVX2), it is still better to load a constant vector
5839 // from the constant pool and not to broadcast it from a scalar.
5840 // But override that restriction when optimizing for size.
5841 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
5842 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
5843 EVT CVT = Ld.getValueType();
5844 assert(!CVT.isVector() && "Must not broadcast a vector type");
5845
5846 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
5847 // For size optimization, also splat v2f64 and v2i64, and for size opt
5848 // with AVX2, also splat i8 and i16.
5849 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
5850 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
5851 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
5852 const Constant *C = nullptr;
5853 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
5854 C = CI->getConstantIntValue();
5855 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
5856 C = CF->getConstantFPValue();
5857
5858 assert(C && "Invalid constant type");
5859
5860 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5861 SDValue CP =
5862 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
5863 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
5864 Ld = DAG.getLoad(
5865 CVT, dl, DAG.getEntryNode(), CP,
5866 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
5867 false, false, Alignment);
5868
5869 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5870 }
5871 }
5872
5873 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
5874
5875 // Handle AVX2 in-register broadcasts.
5876 if (!IsLoad && Subtarget.hasInt256() &&
5877 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
5878 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5879
5880 // The scalar source must be a normal load.
5881 if (!IsLoad)
5882 return SDValue();
5883
5884 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
5885 (Subtarget.hasVLX() && ScalarSize == 64))
5886 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5887
5888 // The integer check is needed for the 64-bit into 128-bit case, so it doesn't
5889 // match double, since there is no vbroadcastsd xmm.
5890 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
5891 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
5892 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5893 }
5894
5895 // Unsupported broadcast.
5896 return SDValue();
5897 }
5898
5899 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
5900 /// underlying vector and index.
5901 ///
5902 /// Modifies \p ExtractedFromVec to the real vector and returns the real
5903 /// index.
5904 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
5905 SDValue ExtIdx) {
5906 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
5907 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
5908 return Idx;
5909
5910 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
5911 // lowered this:
5912 // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
5913 // to:
5914 // (extract_vector_elt (vector_shuffle<2,u,u,u>
5915 // (extract_subvector (v8f32 %vreg0), Constant<4>),
5916 // undef)
5917 // Constant<0>)
5918 // In this case the vector is the extract_subvector expression and the index
5919 // is 2, as specified by the shuffle.
5920 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
5921 SDValue ShuffleVec = SVOp->getOperand(0);
5922 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
5923 assert(ShuffleVecVT.getVectorElementType() ==
5924 ExtractedFromVec.getSimpleValueType().getVectorElementType());
5925
5926 int ShuffleIdx = SVOp->getMaskElt(Idx);
5927 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
5928 ExtractedFromVec = ShuffleVec;
5929 return ShuffleIdx;
5930 }
5931 return Idx;
5932 }
5933
5934 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
5935 MVT VT = Op.getSimpleValueType();
5936
5937 // Skip if insert_vec_elt is not supported.
5938 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5939 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
5940 return SDValue();
5941
5942 SDLoc DL(Op);
5943 unsigned NumElems = Op.getNumOperands();
5944
5945 SDValue VecIn1;
5946 SDValue VecIn2;
5947 SmallVector<unsigned, 4> InsertIndices;
5948 SmallVector<int, 8> Mask(NumElems, -1);
5949
5950 for (unsigned i = 0; i != NumElems; ++i) {
5951 unsigned Opc = Op.getOperand(i).getOpcode();
5952
5953 if (Opc == ISD::UNDEF)
5954 continue;
5955
5956 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
5957 // Quit if more than one element needs inserting.
5958 if (InsertIndices.size() > 1)
5959 return SDValue();
5960
5961 InsertIndices.push_back(i);
5962 continue;
5963 }
5964
5965 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
5966 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
5967 // Quit if non-constant index.
5968 if (!isa<ConstantSDNode>(ExtIdx))
5969 return SDValue();
5970 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
5971
5972 // Quit if extracted from vector of different type.
5973 if (ExtractedFromVec.getValueType() != VT)
5974 return SDValue();
5975
5976 if (!VecIn1.getNode())
5977 VecIn1 = ExtractedFromVec;
5978 else if (VecIn1 != ExtractedFromVec) {
5979 if (!VecIn2.getNode())
5980 VecIn2 = ExtractedFromVec;
5981 else if (VecIn2 != ExtractedFromVec)
5982 // Quit if more than 2 vectors to shuffle
5983 return SDValue();
5984 }
5985
5986 if (ExtractedFromVec == VecIn1)
5987 Mask[i] = Idx;
5988 else if (ExtractedFromVec == VecIn2)
5989 Mask[i] = Idx + NumElems;
5990 }
5991
5992 if (!VecIn1.getNode())
5993 return SDValue();
5994
5995 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
5996 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
5997 for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
5998 unsigned Idx = InsertIndices[i];
5999 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6000 DAG.getIntPtrConstant(Idx, DL));
6001 }
6002
6003 return NV;
6004 }
6005
6006 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
6007 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6008 Op.getScalarValueSizeInBits() == 1 &&
6009 "Can not convert non-constant vector");
6010 uint64_t Immediate = 0;
6011 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6012 SDValue In = Op.getOperand(idx);
6013 if (!In.isUndef())
6014 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6015 }
6016 SDLoc dl(Op);
6017 MVT VT =
6018 MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8));
6019 return DAG.getConstant(Immediate, dl, VT);
6020 }
6021 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
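// For an all-constant mask such as (v8i1 <1,0,1,1,0,0,0,0>), the bits are
// packed into an i8 immediate (0b00001101 here) by ConvertI1VectorToInteger
// and then bitcast back to v8i1.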
6022 SDValue
6023 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6024
6025 MVT VT = Op.getSimpleValueType();
6026 assert((VT.getVectorElementType() == MVT::i1) &&
6027 "Unexpected type in LowerBUILD_VECTORvXi1!");
6028
6029 SDLoc dl(Op);
6030 if (ISD::isBuildVectorAllZeros(Op.getNode()))
6031 return DAG.getTargetConstant(0, dl, VT);
6032
6033 if (ISD::isBuildVectorAllOnes(Op.getNode()))
6034 return DAG.getTargetConstant(1, dl, VT);
6035
6036 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6037 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
6038 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6039 return DAG.getBitcast(VT, Imm);
6040 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6041 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6042 DAG.getIntPtrConstant(0, dl));
6043 }
6044
6045 // The vector has one or more non-constant elements.
6046 uint64_t Immediate = 0;
6047 SmallVector<unsigned, 16> NonConstIdx;
6048 bool IsSplat = true;
6049 bool HasConstElts = false;
6050 int SplatIdx = -1;
6051 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6052 SDValue In = Op.getOperand(idx);
6053 if (In.isUndef())
6054 continue;
6055 if (!isa<ConstantSDNode>(In))
6056 NonConstIdx.push_back(idx);
6057 else {
6058 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6059 HasConstElts = true;
6060 }
6061 if (SplatIdx < 0)
6062 SplatIdx = idx;
6063 else if (In != Op.getOperand(SplatIdx))
6064 IsSplat = false;
6065 }
6066
6067 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
6068 if (IsSplat)
6069 return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
6070 DAG.getConstant(1, dl, VT),
6071 DAG.getConstant(0, dl, VT));
6072
6073 // Insert the non-constant elements one by one.
6074 SDValue DstVec;
6075 SDValue Imm;
6076 if (Immediate) {
6077 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
6078 Imm = DAG.getConstant(Immediate, dl, ImmVT);
6079 }
6080 else if (HasConstElts)
6081 Imm = DAG.getConstant(0, dl, VT);
6082 else
6083 Imm = DAG.getUNDEF(VT);
6084 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6085 DstVec = DAG.getBitcast(VT, Imm);
6086 else {
6087 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6088 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6089 DAG.getIntPtrConstant(0, dl));
6090 }
6091
6092 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
6093 unsigned InsertIdx = NonConstIdx[i];
6094 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6095 Op.getOperand(InsertIdx),
6096 DAG.getIntPtrConstant(InsertIdx, dl));
6097 }
6098 return DstVec;
6099 }
6100
6101 /// \brief Return true if \p N implements a horizontal binop and return the
6102 /// operands for the horizontal binop into V0 and V1.
6103 ///
6104 /// This is a helper function of LowerToHorizontalOp().
6105 /// This function checks that the build_vector \p N in input implements a
6106 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6107 /// operation to match.
6108 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6109 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6110 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6111 /// arithmetic sub.
6112 ///
6113 /// This function only analyzes elements of \p N whose indices are
6114 /// in range [BaseIdx, LastIdx).
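/// As an illustration (with %A and %B as placeholder v4f32 values), a
/// build_vector of the form
///   <(fadd (extractelt %A,0), (extractelt %A,1)),
///    (fadd (extractelt %A,2), (extractelt %A,3)),
///    (fadd (extractelt %B,0), (extractelt %B,1)),
///    (fadd (extractelt %B,2), (extractelt %B,3))>
/// matches over the range [0, 4) with V0 = %A and V1 = %B, which the caller
/// can then lower to a single FHADD (haddps).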
6115 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6116 SelectionDAG &DAG,
6117 unsigned BaseIdx, unsigned LastIdx,
6118 SDValue &V0, SDValue &V1) {
6119 EVT VT = N->getValueType(0);
6120
6121 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6122 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6123 "Invalid Vector in input!");
6124
6125 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6126 bool CanFold = true;
6127 unsigned ExpectedVExtractIdx = BaseIdx;
6128 unsigned NumElts = LastIdx - BaseIdx;
6129 V0 = DAG.getUNDEF(VT);
6130 V1 = DAG.getUNDEF(VT);
6131
6132 // Check if N implements a horizontal binop.
6133 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6134 SDValue Op = N->getOperand(i + BaseIdx);
6135
6136 // Skip UNDEFs.
6137 if (Op->isUndef()) {
6138 // Update the expected vector extract index.
6139 if (i * 2 == NumElts)
6140 ExpectedVExtractIdx = BaseIdx;
6141 ExpectedVExtractIdx += 2;
6142 continue;
6143 }
6144
6145 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6146
6147 if (!CanFold)
6148 break;
6149
6150 SDValue Op0 = Op.getOperand(0);
6151 SDValue Op1 = Op.getOperand(1);
6152
6153 // Try to match the following pattern:
6154 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6155 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6156 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6157 Op0.getOperand(0) == Op1.getOperand(0) &&
6158 isa<ConstantSDNode>(Op0.getOperand(1)) &&
6159 isa<ConstantSDNode>(Op1.getOperand(1)));
6160 if (!CanFold)
6161 break;
6162
6163 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6164 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6165
6166 if (i * 2 < NumElts) {
6167 if (V0.isUndef()) {
6168 V0 = Op0.getOperand(0);
6169 if (V0.getValueType() != VT)
6170 return false;
6171 }
6172 } else {
6173 if (V1.isUndef()) {
6174 V1 = Op0.getOperand(0);
6175 if (V1.getValueType() != VT)
6176 return false;
6177 }
6178 if (i * 2 == NumElts)
6179 ExpectedVExtractIdx = BaseIdx;
6180 }
6181
6182 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6183 if (I0 == ExpectedVExtractIdx)
6184 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6185 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6186 // Try to match the following dag sequence:
6187 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6188 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6189 } else
6190 CanFold = false;
6191
6192 ExpectedVExtractIdx += 2;
6193 }
6194
6195 return CanFold;
6196 }
6197
6198 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6199 /// a concat_vector.
6200 ///
6201 /// This is a helper function of LowerToHorizontalOp().
6202 /// This function expects two 256-bit vectors called V0 and V1.
6203 /// At first, each vector is split into two separate 128-bit vectors.
6204 /// Then, the resulting 128-bit vectors are used to implement two
6205 /// horizontal binary operations.
6206 ///
6207 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6208 ///
6209 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6210 /// the two new horizontal binop.
6211 /// When Mode is set, the first horizontal binop dag node takes as input
6212 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6213 /// horizontal binop dag node takes as input the lower 128-bit of V1
6214 /// and the upper 128-bit of V1.
6215 /// Example:
6216 /// HADD V0_LO, V0_HI
6217 /// HADD V1_LO, V1_HI
6218 ///
6219 /// Otherwise, the first horizontal binop dag node takes as input the lower
6220 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6221 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
6222 /// Example:
6223 /// HADD V0_LO, V1_LO
6224 /// HADD V0_HI, V1_HI
6225 ///
6226 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6227 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6228 /// the upper 128-bits of the result.
6229 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6230 const SDLoc &DL, SelectionDAG &DAG,
6231 unsigned X86Opcode, bool Mode,
6232 bool isUndefLO, bool isUndefHI) {
6233 MVT VT = V0.getSimpleValueType();
6234 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
6235 "Invalid nodes in input!");
6236
6237 unsigned NumElts = VT.getVectorNumElements();
6238 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
6239 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
6240 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
6241 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
6242 MVT NewVT = V0_LO.getSimpleValueType();
6243
6244 SDValue LO = DAG.getUNDEF(NewVT);
6245 SDValue HI = DAG.getUNDEF(NewVT);
6246
6247 if (Mode) {
6248 // Don't emit a horizontal binop if the result is expected to be UNDEF.
6249 if (!isUndefLO && !V0->isUndef())
6250 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6251 if (!isUndefHI && !V1->isUndef())
6252 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6253 } else {
6254 // Don't emit a horizontal binop if the result is expected to be UNDEF.
6255 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
6256 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6257
6258 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
6259 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6260 }
6261
6262 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6263 }
6264
6265 /// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
6266 /// node.
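/// As a sketch (with %A and %B as placeholder v4f32 values), a build_vector
///   <(fsub (extractelt %A,0), (extractelt %B,0)),
///    (fadd (extractelt %A,1), (extractelt %B,1)),
///    (fsub (extractelt %A,2), (extractelt %B,2)),
///    (fadd (extractelt %A,3), (extractelt %B,3))>
/// folds to (X86ISD::ADDSUB %A, %B), i.e. a single addsubps.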
6267 static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
6268 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
6269 MVT VT = BV->getSimpleValueType(0);
6270 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
6271 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
6272 return SDValue();
6273
6274 SDLoc DL(BV);
6275 unsigned NumElts = VT.getVectorNumElements();
6276 SDValue InVec0 = DAG.getUNDEF(VT);
6277 SDValue InVec1 = DAG.getUNDEF(VT);
6278
6279 assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6280 VT == MVT::v2f64) && "build_vector with an invalid type found!");
6281
6282 // Odd-numbered elements in the input build vector are obtained from
6283 // adding two integer/float elements.
6284 // Even-numbered elements in the input build vector are obtained from
6285 // subtracting two integer/float elements.
6286 unsigned ExpectedOpcode = ISD::FSUB;
6287 unsigned NextExpectedOpcode = ISD::FADD;
6288 bool AddFound = false;
6289 bool SubFound = false;
6290
6291 for (unsigned i = 0, e = NumElts; i != e; ++i) {
6292 SDValue Op = BV->getOperand(i);
6293
6294 // Skip 'undef' values.
6295 unsigned Opcode = Op.getOpcode();
6296 if (Opcode == ISD::UNDEF) {
6297 std::swap(ExpectedOpcode, NextExpectedOpcode);
6298 continue;
6299 }
6300
6301 // Early exit if we found an unexpected opcode.
6302 if (Opcode != ExpectedOpcode)
6303 return SDValue();
6304
6305 SDValue Op0 = Op.getOperand(0);
6306 SDValue Op1 = Op.getOperand(1);
6307
6308 // Try to match the following pattern:
6309 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6310 // Early exit if we cannot match that sequence.
6311 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6312 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6313 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6314 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6315 Op0.getOperand(1) != Op1.getOperand(1))
6316 return SDValue();
6317
6318 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6319 if (I0 != i)
6320 return SDValue();
6321
6322 // We found a valid add/sub node. Update the information accordingly.
6323 if (i & 1)
6324 AddFound = true;
6325 else
6326 SubFound = true;
6327
6328 // Update InVec0 and InVec1.
6329 if (InVec0.isUndef()) {
6330 InVec0 = Op0.getOperand(0);
6331 if (InVec0.getSimpleValueType() != VT)
6332 return SDValue();
6333 }
6334 if (InVec1.isUndef()) {
6335 InVec1 = Op1.getOperand(0);
6336 if (InVec1.getSimpleValueType() != VT)
6337 return SDValue();
6338 }
6339
6340 // Make sure that the operands in input to each add/sub node always
6341 // come from the same pair of vectors.
6342 if (InVec0 != Op0.getOperand(0)) {
6343 if (ExpectedOpcode == ISD::FSUB)
6344 return SDValue();
6345
6346 // FADD is commutable. Try to commute the operands
6347 // and then test again.
6348 std::swap(Op0, Op1);
6349 if (InVec0 != Op0.getOperand(0))
6350 return SDValue();
6351 }
6352
6353 if (InVec1 != Op1.getOperand(0))
6354 return SDValue();
6355
6356 // Update the pair of expected opcodes.
6357 std::swap(ExpectedOpcode, NextExpectedOpcode);
6358 }
6359
6360 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6361 if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
6362 return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6363
6364 return SDValue();
6365 }
6366
6367 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
6368 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
6369 const X86Subtarget &Subtarget,
6370 SelectionDAG &DAG) {
6371 MVT VT = BV->getSimpleValueType(0);
6372 unsigned NumElts = VT.getVectorNumElements();
6373 unsigned NumUndefsLO = 0;
6374 unsigned NumUndefsHI = 0;
6375 unsigned Half = NumElts/2;
6376
6377 // Count the number of UNDEF operands in the build_vector in input.
6378 for (unsigned i = 0, e = Half; i != e; ++i)
6379 if (BV->getOperand(i)->isUndef())
6380 NumUndefsLO++;
6381
6382 for (unsigned i = Half, e = NumElts; i != e; ++i)
6383 if (BV->getOperand(i)->isUndef())
6384 NumUndefsHI++;
6385
6386 // Early exit if this is either a build_vector of all UNDEFs or all the
6387 // operands but one are UNDEF.
6388 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6389 return SDValue();
6390
6391 SDLoc DL(BV);
6392 SDValue InVec0, InVec1;
6393 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
6394 // Try to match an SSE3 float HADD/HSUB.
6395 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6396 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6397
6398 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6399 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6400 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
6401 // Try to match an SSSE3 integer HADD/HSUB.
6402 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6403 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6404
6405 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6406 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6407 }
6408
6409 if (!Subtarget.hasAVX())
6410 return SDValue();
6411
6412 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6413 // Try to match an AVX horizontal add/sub of packed single/double
6414 // precision floating point values from 256-bit vectors.
6415 SDValue InVec2, InVec3;
6416 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6417 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6418 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6419 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6420 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6421
6422 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6423 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6424 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6425 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6426 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6427 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6428 // Try to match an AVX2 horizontal add/sub of signed integers.
6429 SDValue InVec2, InVec3;
6430 unsigned X86Opcode;
6431 bool CanFold = true;
6432
6433 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6434 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6435 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6436 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6437 X86Opcode = X86ISD::HADD;
6438 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6439 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6440 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6441 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6442 X86Opcode = X86ISD::HSUB;
6443 else
6444 CanFold = false;
6445
6446 if (CanFold) {
6447 // Fold this build_vector into a single horizontal add/sub.
6448 // Do this only if the target has AVX2.
6449 if (Subtarget.hasAVX2())
6450 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6451
6452 // Do not try to expand this build_vector into a pair of horizontal
6453 // add/sub if we can emit a pair of scalar add/sub.
6454 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6455 return SDValue();
6456
6457 // Convert this build_vector into a pair of horizontal binop followed by
6458 // a concat vector.
6459 bool isUndefLO = NumUndefsLO == Half;
6460 bool isUndefHI = NumUndefsHI == Half;
6461 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6462 isUndefLO, isUndefHI);
6463 }
6464 }
6465
6466 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6467 VT == MVT::v16i16) && Subtarget.hasAVX()) {
6468 unsigned X86Opcode;
6469 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6470 X86Opcode = X86ISD::HADD;
6471 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6472 X86Opcode = X86ISD::HSUB;
6473 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6474 X86Opcode = X86ISD::FHADD;
6475 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6476 X86Opcode = X86ISD::FHSUB;
6477 else
6478 return SDValue();
6479
6480 // Don't try to expand this build_vector into a pair of horizontal add/sub
6481 // if we can simply emit a pair of scalar add/sub.
6482 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6483 return SDValue();
6484
6485 // Convert this build_vector into two horizontal add/sub followed by
6486 // a concat vector.
6487 bool isUndefLO = NumUndefsLO == Half;
6488 bool isUndefHI = NumUndefsHI == Half;
6489 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6490 isUndefLO, isUndefHI);
6491 }
6492
6493 return SDValue();
6494 }
6495
6496 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
6497 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
6498 /// just apply the bit to the vectors.
6499 /// NOTE: It's not in our interest to start making a general purpose vectorizer
6500 /// from this, but enough scalar bit operations are created from the later
6501 /// legalization + scalarization stages to need basic support.
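/// For example, (build_vector (and %a, 1), (and %b, 2), (and %c, 4),
/// (and %d, 8)) becomes (and (build_vector %a, %b, %c, %d),
/// (build_vector 1, 2, 4, 8)), where %a..%d are placeholder scalars and the
/// constants are assumed to already be on the RHS of each element.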
6502 static SDValue lowerBuildVectorToBitOp(SDValue Op, SelectionDAG &DAG) {
6503 SDLoc DL(Op);
6504 MVT VT = Op.getSimpleValueType();
6505 unsigned NumElems = VT.getVectorNumElements();
6506 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6507
6508 // Check that all elements have the same opcode.
6509 // TODO: Should we allow UNDEFS and if so how many?
6510 unsigned Opcode = Op.getOperand(0).getOpcode();
6511 for (unsigned i = 1; i < NumElems; ++i)
6512 if (Opcode != Op.getOperand(i).getOpcode())
6513 return SDValue();
6514
6515 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
6516 switch (Opcode) {
6517 default:
6518 return SDValue();
6519 case ISD::AND:
6520 case ISD::XOR:
6521 case ISD::OR:
6522 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
6523 return SDValue();
6524 break;
6525 }
6526
6527 SmallVector<SDValue, 4> LHSElts, RHSElts;
6528 for (SDValue Elt : Op->ops()) {
6529 SDValue LHS = Elt.getOperand(0);
6530 SDValue RHS = Elt.getOperand(1);
6531
6532 // We expect the canonicalized RHS operand to be the constant.
6533 if (!isa<ConstantSDNode>(RHS))
6534 return SDValue();
6535 LHSElts.push_back(LHS);
6536 RHSElts.push_back(RHS);
6537 }
6538
6539 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
6540 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
6541 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
6542 }
6543
6544 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
6545 /// functionality to do this, so it's all zeros, all ones, or some derivation
6546 /// that is cheap to calculate.
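/// E.g. an all-zeros v4i32 build_vector is returned as-is (it will be matched
/// by pxor/xorps), while an all-ones v4i32 build_vector is turned into the
/// ones vector (matched by pcmpeqd); anything else is left for other lowering
/// paths.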
6547 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
6548 const X86Subtarget &Subtarget) {
6549 SDLoc DL(Op);
6550 MVT VT = Op.getSimpleValueType();
6551
6552 // Vectors containing all zeros can be matched by pxor and xorps.
6553 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6554 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6555 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6556 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6557 return Op;
6558
6559 return getZeroVector(VT, Subtarget, DAG, DL);
6560 }
6561
6562 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6563 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6564 // vpcmpeqd on 256-bit vectors.
6565 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6566 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
6567 (VT == MVT::v8i32 && Subtarget.hasInt256()))
6568 return Op;
6569
6570 return getOnesVector(VT, Subtarget, DAG, DL);
6571 }
6572
6573 return SDValue();
6574 }
6575
6576 SDValue
6577 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6578 SDLoc dl(Op);
6579
6580 MVT VT = Op.getSimpleValueType();
6581 MVT ExtVT = VT.getVectorElementType();
6582 unsigned NumElems = Op.getNumOperands();
6583
6584 // Lower build_vectors of predicate (vXi1) vectors separately.
6585 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
6586 return LowerBUILD_VECTORvXi1(Op, DAG);
6587
6588 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
6589 return VectorConstant;
6590
6591 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
6592 if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
6593 return AddSub;
6594 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
6595 return HorizontalOp;
6596 if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
6597 return Broadcast;
6598 if (SDValue BitOp = lowerBuildVectorToBitOp(Op, DAG))
6599 return BitOp;
6600
6601 unsigned EVTBits = ExtVT.getSizeInBits();
6602
6603 unsigned NumZero = 0;
6604 unsigned NumNonZero = 0;
6605 uint64_t NonZeros = 0;
6606 bool IsAllConstants = true;
6607 SmallSet<SDValue, 8> Values;
6608 for (unsigned i = 0; i < NumElems; ++i) {
6609 SDValue Elt = Op.getOperand(i);
6610 if (Elt.isUndef())
6611 continue;
6612 Values.insert(Elt);
6613 if (Elt.getOpcode() != ISD::Constant &&
6614 Elt.getOpcode() != ISD::ConstantFP)
6615 IsAllConstants = false;
6616 if (X86::isZeroNode(Elt))
6617 NumZero++;
6618 else {
6619 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
6620 NonZeros |= ((uint64_t)1 << i);
6621 NumNonZero++;
6622 }
6623 }
6624
6625 // All-undef vector. Return an UNDEF. All-zero vectors were handled above.
6626 if (NumNonZero == 0)
6627 return DAG.getUNDEF(VT);
6628
6629 // Special case for a single non-zero, non-undef element.
6630 if (NumNonZero == 1) {
6631 unsigned Idx = countTrailingZeros(NonZeros);
6632 SDValue Item = Op.getOperand(Idx);
6633
6634 // If this is an insertion of an i64 value on x86-32, and if the top bits of
6635 // the value are obviously zero, truncate the value to i32 and do the
6636 // insertion that way. Only do this if the value is non-constant or if the
6637 // value is a constant being inserted into element 0. It is cheaper to do
6638 // a constant pool load than it is to do a movd + shuffle.
6639 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
6640 (!IsAllConstants || Idx == 0)) {
6641 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6642 // Handle SSE only.
6643 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6644 MVT VecVT = MVT::v4i32;
6645
6646 // Truncate the value (which may itself be a constant) to i32, and
6647 // convert it to a vector with movd (S2V+shuffle to zero extend).
6648 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6649 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6650 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
6651 Item, Idx * 2, true, Subtarget, DAG));
6652 }
6653 }
6654
6655 // If we have a constant or non-constant insertion into the low element of
6656 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6657 // the rest of the elements. This will be matched as movd/movq/movss/movsd
6658 // depending on what the source datatype is.
6659 if (Idx == 0) {
6660 if (NumZero == 0)
6661 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6662
6663 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6664 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
6665 if (VT.is512BitVector()) {
6666 SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6667 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
6668 Item, DAG.getIntPtrConstant(0, dl));
6669 }
6670 assert((VT.is128BitVector() || VT.is256BitVector()) &&
6671 "Expected an SSE value type!");
6672 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6673 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
6674 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6675 }
6676
6677 // We can't directly insert an i8 or i16 into a vector, so zero extend
6678 // it to i32 first.
6679 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
6680 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
6681 if (VT.getSizeInBits() >= 256) {
6682 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
6683 if (Subtarget.hasAVX()) {
6684 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
6685 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6686 } else {
6687 // Without AVX, we need to extend to a 128-bit vector and then
6688 // insert into the 256-bit vector.
6689 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
6690 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
6691 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
6692 }
6693 } else {
6694 assert(VT.is128BitVector() && "Expected an SSE value type!");
6695 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
6696 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6697 }
6698 return DAG.getBitcast(VT, Item);
6699 }
6700 }
6701
6702 // Is it a vector logical left shift?
6703 if (NumElems == 2 && Idx == 1 &&
6704 X86::isZeroNode(Op.getOperand(0)) &&
6705 !X86::isZeroNode(Op.getOperand(1))) {
6706 unsigned NumBits = VT.getSizeInBits();
6707 return getVShift(true, VT,
6708 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
6709 VT, Op.getOperand(1)),
6710 NumBits/2, DAG, *this, dl);
6711 }
6712
6713 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
6714 return SDValue();
6715
6716 // Otherwise, if this is a vector with i32 or f32 elements, and the element
6717 // is a non-constant being inserted into an element other than the low one,
6718 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
6719 // movd/movss) to move this into the low element, then shuffle it into
6720 // place.
6721 if (EVTBits == 32) {
6722 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6723 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
6724 }
6725 }
6726
6727 // Splat is obviously ok. Let the legalizer expand it to a shuffle.
6728 if (Values.size() == 1) {
6729 if (EVTBits == 32) {
6730 // Instead of a shuffle like this:
6731 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
6732 // check if it's possible to issue this instead:
6733 // shuffle (vload ptr), undef, <1, 1, 1, 1>
6734 unsigned Idx = countTrailingZeros(NonZeros);
6735 SDValue Item = Op.getOperand(Idx);
6736 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
6737 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
6738 }
6739 return SDValue();
6740 }
6741
6742 // A vector full of immediates; various special cases are already
6743 // handled, so this is best done with a single constant-pool load.
6744 if (IsAllConstants)
6745 return SDValue();
6746
6747 // See if we can use a vector load to get all of the elements.
6748 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
6749 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
6750 if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
6751 return LD;
6752 }
6753
6754 // For AVX-length vectors, build the individual 128-bit pieces and use
6755 // shuffles to put them in place.
6756 if (VT.is256BitVector() || VT.is512BitVector()) {
6757 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
6758
6759 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
6760
6761 // Build both the lower and upper subvector.
6762 SDValue Lower =
6763 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
6764 SDValue Upper = DAG.getBuildVector(
6765 HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
6766
6767 // Recreate the wider vector with the lower and upper part.
6768 if (VT.is256BitVector())
6769 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
6770 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
6771 }
6772
6773 // Let legalizer expand 2-wide build_vectors.
6774 if (EVTBits == 64) {
6775 if (NumNonZero == 1) {
6776 // One half is zero or undef.
6777 unsigned Idx = countTrailingZeros(NonZeros);
6778 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
6779 Op.getOperand(Idx));
6780 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
6781 }
6782 return SDValue();
6783 }
6784
6785 // If element VT is < 32 bits, convert it to inserts into a zero vector.
6786 if (EVTBits == 8 && NumElems == 16)
6787 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
6788 DAG, Subtarget, *this))
6789 return V;
6790
6791 if (EVTBits == 16 && NumElems == 8)
6792 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
6793 DAG, Subtarget, *this))
6794 return V;
6795
6796 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
6797 if (EVTBits == 32 && NumElems == 4)
6798 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
6799 return V;
6800
6801 // If element VT is == 32 bits, turn it into a number of shuffles.
6802 if (NumElems == 4 && NumZero > 0) {
6803 SmallVector<SDValue, 8> Ops(NumElems);
6804 for (unsigned i = 0; i < 4; ++i) {
6805 bool isZero = !(NonZeros & (1ULL << i));
6806 if (isZero)
6807 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
6808 else
6809 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
6810 }
6811
6812 for (unsigned i = 0; i < 2; ++i) {
6813 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
6814 default: break;
6815 case 0:
6816 Ops[i] = Ops[i*2]; // Must be a zero vector.
6817 break;
6818 case 1:
6819 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
6820 break;
6821 case 2:
6822 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
6823 break;
6824 case 3:
6825 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
6826 break;
6827 }
6828 }
6829
6830 bool Reverse1 = (NonZeros & 0x3) == 2;
6831 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
6832 int MaskVec[] = {
6833 Reverse1 ? 1 : 0,
6834 Reverse1 ? 0 : 1,
6835 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
6836 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
6837 };
6838 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
6839 }
6840
6841 if (Values.size() > 1 && VT.is128BitVector()) {
6842     // Check for a build vector formed mostly from a shuffle plus a few inserts.
6843 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
6844 return Sh;
6845
6846 // For SSE 4.1, use insertps to put the high elements into the low element.
6847 if (Subtarget.hasSSE41()) {
6848 SDValue Result;
6849 if (!Op.getOperand(0).isUndef())
6850 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
6851 else
6852 Result = DAG.getUNDEF(VT);
6853
6854 for (unsigned i = 1; i < NumElems; ++i) {
6855 if (Op.getOperand(i).isUndef()) continue;
6856 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
6857 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6858 }
6859 return Result;
6860 }
6861
6862 // Otherwise, expand into a number of unpckl*, start by extending each of
6863 // our (non-undef) elements to the full vector width with the element in the
6864 // bottom slot of the vector (which generates no code for SSE).
6865 SmallVector<SDValue, 8> Ops(NumElems);
6866 for (unsigned i = 0; i < NumElems; ++i) {
6867 if (!Op.getOperand(i).isUndef())
6868 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
6869 else
6870 Ops[i] = DAG.getUNDEF(VT);
6871 }
6872
6873 // Next, we iteratively mix elements, e.g. for v4f32:
6874 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
6875 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
6876 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
6877 unsigned EltStride = NumElems >> 1;
6878 while (EltStride != 0) {
6879 for (unsigned i = 0; i < EltStride; ++i) {
6880 // If Ops[i+EltStride] is undef and this is the first round of mixing,
6881 // then it is safe to just drop this shuffle: V[i] is already in the
6882 // right place, the one element (since it's the first round) being
6883 // inserted as undef can be dropped. This isn't safe for successive
6884 // rounds because they will permute elements within both vectors.
6885 if (Ops[i+EltStride].isUndef() &&
6886 EltStride == NumElems/2)
6887 continue;
6888
6889 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
6890 }
6891 EltStride >>= 1;
6892 }
6893 return Ops[0];
6894 }
6895 return SDValue();
6896 }
6897
6898 // 256-bit AVX can use the vinsertf128 instruction
6899 // to create 256-bit vectors from two other 128-bit ones.
6900 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
6901 SDLoc dl(Op);
6902 MVT ResVT = Op.getSimpleValueType();
6903
6904 assert((ResVT.is256BitVector() ||
6905 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
6906
6907 SDValue V1 = Op.getOperand(0);
6908 SDValue V2 = Op.getOperand(1);
6909 unsigned NumElems = ResVT.getVectorNumElements();
6910 if (ResVT.is256BitVector())
6911 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
6912
6913 if (Op.getNumOperands() == 4) {
6914 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
6915 ResVT.getVectorNumElements()/2);
6916 SDValue V3 = Op.getOperand(2);
6917 SDValue V4 = Op.getOperand(3);
6918 return concat256BitVectors(
6919 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
6920 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
6921 NumElems, DAG, dl);
6922 }
6923 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
6924 }
6925
6926 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
6927 const X86Subtarget &Subtarget,
6928 SelectionDAG & DAG) {
6929 SDLoc dl(Op);
6930 MVT ResVT = Op.getSimpleValueType();
6931 unsigned NumOfOperands = Op.getNumOperands();
6932
6933 assert(isPowerOf2_32(NumOfOperands) &&
6934 "Unexpected number of operands in CONCAT_VECTORS");
6935
6936 SDValue Undef = DAG.getUNDEF(ResVT);
6937 if (NumOfOperands > 2) {
6938 // Specialize the cases when all, or all but one, of the operands are undef.
6939 unsigned NumOfDefinedOps = 0;
6940 unsigned OpIdx = 0;
6941 for (unsigned i = 0; i < NumOfOperands; i++)
6942 if (!Op.getOperand(i).isUndef()) {
6943 NumOfDefinedOps++;
6944 OpIdx = i;
6945 }
6946 if (NumOfDefinedOps == 0)
6947 return Undef;
6948 if (NumOfDefinedOps == 1) {
6949 unsigned SubVecNumElts =
6950 Op.getOperand(OpIdx).getValueType().getVectorNumElements();
6951 SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
6952 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
6953 Op.getOperand(OpIdx), IdxVal);
6954 }
6955
6956 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
6957 ResVT.getVectorNumElements()/2);
6958 SmallVector<SDValue, 2> Ops;
6959 for (unsigned i = 0; i < NumOfOperands/2; i++)
6960 Ops.push_back(Op.getOperand(i));
6961 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
6962 Ops.clear();
6963 for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
6964 Ops.push_back(Op.getOperand(i));
6965 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
6966 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
6967 }
6968
6969 // 2 operands
6970 SDValue V1 = Op.getOperand(0);
6971 SDValue V2 = Op.getOperand(1);
6972 unsigned NumElems = ResVT.getVectorNumElements();
6973 assert(V1.getValueType() == V2.getValueType() &&
6974 V1.getValueType().getVectorNumElements() == NumElems/2 &&
6975 "Unexpected operands in CONCAT_VECTORS");
6976
6977 if (ResVT.getSizeInBits() >= 16)
6978 return Op; // The operation is legal with KUNPCK
6979
6980 bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
6981 bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
6982 SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
6983 if (IsZeroV1 && IsZeroV2)
6984 return ZeroVec;
6985
6986 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6987 if (V2.isUndef())
6988 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
6989 if (IsZeroV2)
6990 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
6991
6992 SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
6993 if (V1.isUndef())
6994 V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
6995
6996 if (IsZeroV1)
6997 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
6998
6999 V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7000 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
7001 }
7002
7003 static SDValue LowerCONCAT_VECTORS(SDValue Op,
7004 const X86Subtarget &Subtarget,
7005 SelectionDAG &DAG) {
7006 MVT VT = Op.getSimpleValueType();
7007 if (VT.getVectorElementType() == MVT::i1)
7008 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
7009
7010 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7011 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7012 Op.getNumOperands() == 4)));
7013
7014 // AVX can use the vinsertf128 instruction to create 256-bit vectors
7015 // from two other 128-bit ones.
7016
7017 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7018 return LowerAVXCONCAT_VECTORS(Op, DAG);
7019 }
7020
7021 //===----------------------------------------------------------------------===//
7022 // Vector shuffle lowering
7023 //
7024 // This is an experimental code path for lowering vector shuffles on x86. It is
7025 // designed to handle arbitrary vector shuffles and blends, gracefully
7026 // degrading performance as necessary. It works hard to recognize idiomatic
7027 // shuffles and lower them to optimal instruction patterns without leaving
7028 // a framework that allows reasonably efficient handling of all vector shuffle
7029 // patterns.
7030 //===----------------------------------------------------------------------===//
7031
7032 /// \brief Tiny helper function to identify a no-op mask.
7033 ///
7034 /// This is a somewhat boring predicate function. It checks whether the mask
7035 /// array input, which is assumed to be a single-input shuffle mask of the kind
7036 /// used by the X86 shuffle instructions (not a fully general
7037 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7038 /// in-place shuffle are 'no-op's.
7039 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7040 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7041 assert(Mask[i] >= -1 && "Out of bound mask element!");
7042 if (Mask[i] >= 0 && Mask[i] != i)
7043 return false;
7044 }
7045 return true;
7046 }
7047
7048 /// \brief Test whether there are elements crossing 128-bit lanes in this
7049 /// shuffle mask.
7050 ///
7051 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7052 /// and we routinely test for these.
7053 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7054 int LaneSize = 128 / VT.getScalarSizeInBits();
7055 int Size = Mask.size();
7056 for (int i = 0; i < Size; ++i)
7057 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7058 return true;
7059 return false;
7060 }
7061
7062 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
7063 ///
7064 /// This checks a shuffle mask to see if it is performing the same
7065 /// lane-relative shuffle in each sub-lane. This trivially implies
7066 /// that it is also not lane-crossing. It may however involve a blend from the
7067 /// same lane of a second vector.
7068 ///
7069 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7070 /// non-trivial to compute in the face of undef lanes. The representation is
7071 /// suitable for use with existing 128-bit shuffles as entries from the second
7072 /// vector have been remapped to [LaneSize, 2*LaneSize).
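/// For example, the v8f32 mask <0, 8, 2, 10, 4, 12, 6, 14> repeats the
/// 128-bit lane mask <0, 4, 2, 6>, an in-lane blend of the two inputs.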
7073 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
7074 ArrayRef<int> Mask,
7075 SmallVectorImpl<int> &RepeatedMask) {
7076 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
7077 RepeatedMask.assign(LaneSize, -1);
7078 int Size = Mask.size();
7079 for (int i = 0; i < Size; ++i) {
7080 if (Mask[i] < 0)
7081 continue;
7082 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7083 // This entry crosses lanes, so there is no way to model this shuffle.
7084 return false;
7085
7086 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7087 // Adjust second vector indices to start at LaneSize instead of Size.
7088 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
7089 : Mask[i] % LaneSize + LaneSize;
7090 if (RepeatedMask[i % LaneSize] < 0)
7091 // This is the first non-undef entry in this slot of a 128-bit lane.
7092 RepeatedMask[i % LaneSize] = LocalM;
7093 else if (RepeatedMask[i % LaneSize] != LocalM)
7094 // Found a mismatch with the repeated mask.
7095 return false;
7096 }
7097 return true;
7098 }
7099
7100 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
7101 static bool
7102 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7103 SmallVectorImpl<int> &RepeatedMask) {
7104 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
7105 }
7106
7107 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
7108 static bool
7109 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7110 SmallVectorImpl<int> &RepeatedMask) {
7111 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
7112 }
7113
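/// \brief Widen a shuffle mask by a scale factor.
///
/// Each mask element is replaced by Scale consecutive sub-element indices, so
/// <1, 0> scaled by 2 becomes <2, 3, 0, 1>. Sentinel (negative) values are
/// simply replicated into every sub-element they cover.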
7114 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
7115 SmallVectorImpl<int> &ScaledMask) {
7116 assert(0 < Scale && "Unexpected scaling factor");
7117 int NumElts = Mask.size();
7118 ScaledMask.assign(NumElts * Scale, -1);
7119
7120 for (int i = 0; i != NumElts; ++i) {
7121 int M = Mask[i];
7122
7123 // Repeat sentinel values in every mask element.
7124 if (M < 0) {
7125 for (int s = 0; s != Scale; ++s)
7126 ScaledMask[(Scale * i) + s] = M;
7127 continue;
7128 }
7129
7130 // Scale mask element and increment across each mask element.
7131 for (int s = 0; s != Scale; ++s)
7132 ScaledMask[(Scale * i) + s] = (Scale * M) + s;
7133 }
7134 }
7135
7136 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7137 /// arguments.
7138 ///
7139 /// This is a fast way to test a shuffle mask against a fixed pattern:
7140 ///
7141 /// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
7142 ///
7143 /// It returns true if the mask is exactly as wide as ExpectedMask, and each
7144 /// element of the mask is either -1 (signifying undef) or the value given
7145 /// in ExpectedMask.
7146 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
7147 ArrayRef<int> ExpectedMask) {
7148 if (Mask.size() != ExpectedMask.size())
7149 return false;
7150
7151 int Size = Mask.size();
7152
7153 // If the values are build vectors, we can look through them to find
7154 // equivalent inputs that make the shuffles equivalent.
7155 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
7156 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
7157
7158 for (int i = 0; i < Size; ++i) {
7159 assert(Mask[i] >= -1 && "Out of bound mask element!");
7160 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
7161 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
7162 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
7163 if (!MaskBV || !ExpectedBV ||
7164 MaskBV->getOperand(Mask[i] % Size) !=
7165 ExpectedBV->getOperand(ExpectedMask[i] % Size))
7166 return false;
7167 }
7168 }
7169
7170 return true;
7171 }
7172
7173 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
7174 ///
7175 /// The masks must be exactly the same width.
7176 ///
7177 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
7178 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
7179 ///
7180 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
7181 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
7182 ArrayRef<int> ExpectedMask) {
7183 int Size = Mask.size();
7184 if (Size != (int)ExpectedMask.size())
7185 return false;
7186
7187 for (int i = 0; i < Size; ++i)
7188 if (Mask[i] == SM_SentinelUndef)
7189 continue;
7190 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
7191 return false;
7192 else if (Mask[i] != ExpectedMask[i])
7193 return false;
7194
7195 return true;
7196 }
7197
7198 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7199 ///
7200 /// This helper function produces an 8-bit shuffle immediate corresponding to
7201 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7202 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7203 /// example.
7204 ///
7205 /// NB: We rely heavily on "undef" masks preserving the input lane.
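/// For example, the mask <3, 1, 2, 0> encodes as 0b00100111 (0x27), the same
/// two-bit-per-lane form consumed by PSHUFD/SHUFPS style immediates.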
7206 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
7207 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7208 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7209 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7210 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7211 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7212
7213 unsigned Imm = 0;
7214 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
7215 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
7216 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
7217 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
7218 return Imm;
7219 }
7220
7221 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
7222 SelectionDAG &DAG) {
7223 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
7224 }
7225
7226 /// \brief Compute whether each element of a shuffle is zeroable.
7227 ///
7228 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7229 /// Either it is an undef element in the shuffle mask, the element of the input
7230 /// referenced is undef, or the element of the input referenced is known to be
7231 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7232 /// as many lanes with this technique as possible to simplify the remaining
7233 /// shuffle.
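/// For example, given the mask <0, 4, 1, 5> with V2 an all-zeros build vector,
/// shuffle elements 1 and 3 are zeroable.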
7234 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7235 SDValue V1, SDValue V2) {
7236 SmallBitVector Zeroable(Mask.size(), false);
7237 V1 = peekThroughBitcasts(V1);
7238 V2 = peekThroughBitcasts(V2);
7239
7240 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7241 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7242
7243 int VectorSizeInBits = V1.getValueType().getSizeInBits();
7244 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
7245 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7246
7247 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7248 int M = Mask[i];
7249 // Handle the easy cases.
7250 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7251 Zeroable[i] = true;
7252 continue;
7253 }
7254
7255 // Determine shuffle input and normalize the mask.
7256 SDValue V = M < Size ? V1 : V2;
7257 M %= Size;
7258
7259 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7260 if (V.getOpcode() != ISD::BUILD_VECTOR)
7261 continue;
7262
7263     // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
7264 // the (larger) source element must be UNDEF/ZERO.
7265 if ((Size % V.getNumOperands()) == 0) {
7266 int Scale = Size / V->getNumOperands();
7267 SDValue Op = V.getOperand(M / Scale);
7268 if (Op.isUndef() || X86::isZeroNode(Op))
7269 Zeroable[i] = true;
7270 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7271 APInt Val = Cst->getAPIntValue();
7272 Val = Val.lshr((M % Scale) * ScalarSizeInBits);
7273 Val = Val.getLoBits(ScalarSizeInBits);
7274 Zeroable[i] = (Val == 0);
7275 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7276 APInt Val = Cst->getValueAPF().bitcastToAPInt();
7277 Val = Val.lshr((M % Scale) * ScalarSizeInBits);
7278 Val = Val.getLoBits(ScalarSizeInBits);
7279 Zeroable[i] = (Val == 0);
7280 }
7281 continue;
7282 }
7283
7284     // If the BUILD_VECTOR has more elements, then all the (smaller) source
7285 // elements must be UNDEF or ZERO.
7286 if ((V.getNumOperands() % Size) == 0) {
7287 int Scale = V->getNumOperands() / Size;
7288 bool AllZeroable = true;
7289 for (int j = 0; j < Scale; ++j) {
7290 SDValue Op = V.getOperand((M * Scale) + j);
7291 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
7292 }
7293 Zeroable[i] = AllZeroable;
7294 continue;
7295 }
7296 }
7297
7298 return Zeroable;
7299 }
7300
7301 /// Try to lower a shuffle with a single PSHUFB of V1.
7302 /// This is only possible if V2 is unused (at all, or only for zero elements).
7303 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
7304 ArrayRef<int> Mask, SDValue V1,
7305 SDValue V2,
7306 const X86Subtarget &Subtarget,
7307 SelectionDAG &DAG) {
7308 int Size = Mask.size();
7309 int LaneSize = 128 / VT.getScalarSizeInBits();
7310 const int NumBytes = VT.getSizeInBits() / 8;
7311 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
7312
7313 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
7314 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
7315 (Subtarget.hasBWI() && VT.is512BitVector()));
7316
7317 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7318
7319 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
7320 // Sign bit set in i8 mask means zero element.
7321 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
7322
7323 for (int i = 0; i < NumBytes; ++i) {
7324 int M = Mask[i / NumEltBytes];
7325 if (M < 0) {
7326 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
7327 continue;
7328 }
7329 if (Zeroable[i / NumEltBytes]) {
7330 PSHUFBMask[i] = ZeroMask;
7331 continue;
7332 }
7333 // Only allow V1.
7334 if (M >= Size)
7335 return SDValue();
7336
7337 // PSHUFB can't cross lanes, ensure this doesn't happen.
7338 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
7339 return SDValue();
7340
7341 M = M % LaneSize;
7342 M = M * NumEltBytes + (i % NumEltBytes);
7343 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
7344 }
7345
7346 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
7347 return DAG.getBitcast(
7348 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V1),
7349 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
7350 }
7351
7352 // X86 has dedicated unpack instructions that can handle specific blend
7353 // operations: UNPCKH and UNPCKL.
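// For v4i32 the generated masks are <0, 4, 1, 5> (UNPCKL) and <2, 6, 3, 7>
// (UNPCKH); wider types repeat the same pattern within each 128-bit lane.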
7354 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
7355 ArrayRef<int> Mask, SDValue V1,
7356 SDValue V2, SelectionDAG &DAG) {
7357 int NumElts = VT.getVectorNumElements();
7358 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
7359 SmallVector<int, 8> Unpckl(NumElts);
7360 SmallVector<int, 8> Unpckh(NumElts);
7361
7362 for (int i = 0; i < NumElts; ++i) {
7363 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
7364 int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
7365 int HiPos = LoPos + NumEltsInLane / 2;
7366 Unpckl[i] = LoPos;
7367 Unpckh[i] = HiPos;
7368 }
7369
7370 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
7371 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
7372 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
7373 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
7374
7375 // Commute and try again.
7376 ShuffleVectorSDNode::commuteMask(Unpckl);
7377 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
7378 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
7379
7380 ShuffleVectorSDNode::commuteMask(Unpckh);
7381 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
7382 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
7383
7384 return SDValue();
7385 }
7386
7387 /// \brief Try to emit a bitmask instruction for a shuffle.
7388 ///
7389 /// This handles cases where we can model a blend exactly as a bitmask due to
7390 /// one of the inputs being zeroable.
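/// For example, a v4i32 shuffle whose only non-zeroable elements are 0 and 2,
/// both taken in place from V1, lowers to an AND of V1 with <-1, 0, -1, 0>.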
7391 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
7392 SDValue V2, ArrayRef<int> Mask,
7393 SelectionDAG &DAG) {
7394 MVT EltVT = VT.getVectorElementType();
7395 int NumEltBits = EltVT.getSizeInBits();
7396 MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
7397 SDValue Zero = DAG.getConstant(0, DL, IntEltVT);
7398 SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
7399 IntEltVT);
7400 if (EltVT.isFloatingPoint()) {
7401 Zero = DAG.getBitcast(EltVT, Zero);
7402 AllOnes = DAG.getBitcast(EltVT, AllOnes);
7403 }
7404 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
7405 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7406 SDValue V;
7407 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7408 if (Zeroable[i])
7409 continue;
7410 if (Mask[i] % Size != i)
7411 return SDValue(); // Not a blend.
7412 if (!V)
7413 V = Mask[i] < Size ? V1 : V2;
7414 else if (V != (Mask[i] < Size ? V1 : V2))
7415 return SDValue(); // Can only let one input through the mask.
7416
7417 VMaskOps[i] = AllOnes;
7418 }
7419 if (!V)
7420 return SDValue(); // No non-zeroable elements!
7421
7422 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
7423 V = DAG.getNode(VT.isFloatingPoint()
7424 ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
7425 DL, VT, V, VMask);
7426 return V;
7427 }
7428
7429 /// \brief Try to emit a blend instruction for a shuffle using bit math.
7430 ///
7431 /// This is used as a fallback approach when first class blend instructions are
7432 /// unavailable. Currently it is only suitable for integer vectors, but could
7433 /// be generalized for floating point vectors if desirable.
7434 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
7435 SDValue V2, ArrayRef<int> Mask,
7436 SelectionDAG &DAG) {
7437 assert(VT.isInteger() && "Only supports integer vector types!");
7438 MVT EltVT = VT.getVectorElementType();
7439 int NumEltBits = EltVT.getSizeInBits();
7440 SDValue Zero = DAG.getConstant(0, DL, EltVT);
7441 SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
7442 EltVT);
7443 SmallVector<SDValue, 16> MaskOps;
7444 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7445 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
7446 return SDValue(); // Shuffled input!
7447 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
7448 }
7449
7450 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
7451 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
7452 // We have to cast V2 around.
7453 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
7454 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
7455 DAG.getBitcast(MaskVT, V1Mask),
7456 DAG.getBitcast(MaskVT, V2)));
7457 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
7458 }
7459
7460 /// \brief Try to emit a blend instruction for a shuffle.
7461 ///
7462 /// This doesn't do any checks for the availability of instructions for blending
7463 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7464 /// be matched in the backend with the type given. What it does check for is
7465 /// that the shuffle mask is a blend, or convertible into a blend with zero.
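/// For example, the v4f32 mask <0, 5, 2, 7> is a blend of V1 and V2 with an
/// immediate of 0b1010, where a set bit i selects element i from V2.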
7466 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
7467 SDValue V2, ArrayRef<int> Original,
7468 const X86Subtarget &Subtarget,
7469 SelectionDAG &DAG) {
7470 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7471 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7472 SmallVector<int, 8> Mask(Original.begin(), Original.end());
7473 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7474 bool ForceV1Zero = false, ForceV2Zero = false;
7475
7476 // Attempt to generate the binary blend mask. If an input is zero then
7477 // we can use any lane.
7478 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
7479 unsigned BlendMask = 0;
7480 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7481 int M = Mask[i];
7482 if (M < 0)
7483 continue;
7484 if (M == i)
7485 continue;
7486 if (M == i + Size) {
7487 BlendMask |= 1u << i;
7488 continue;
7489 }
7490 if (Zeroable[i]) {
7491 if (V1IsZero) {
7492 ForceV1Zero = true;
7493 Mask[i] = i;
7494 continue;
7495 }
7496 if (V2IsZero) {
7497 ForceV2Zero = true;
7498 BlendMask |= 1u << i;
7499 Mask[i] = i + Size;
7500 continue;
7501 }
7502 }
7503 return SDValue(); // Shuffled input!
7504 }
7505
7506 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
7507 if (ForceV1Zero)
7508 V1 = getZeroVector(VT, Subtarget, DAG, DL);
7509 if (ForceV2Zero)
7510 V2 = getZeroVector(VT, Subtarget, DAG, DL);
7511
7512 auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
7513 unsigned ScaledMask = 0;
7514 for (int i = 0; i != Size; ++i)
7515 if (BlendMask & (1u << i))
7516 for (int j = 0; j != Scale; ++j)
7517 ScaledMask |= 1u << (i * Scale + j);
7518 return ScaledMask;
7519 };
7520
7521 switch (VT.SimpleTy) {
7522 case MVT::v2f64:
7523 case MVT::v4f32:
7524 case MVT::v4f64:
7525 case MVT::v8f32:
7526 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7527 DAG.getConstant(BlendMask, DL, MVT::i8));
7528
7529 case MVT::v4i64:
7530 case MVT::v8i32:
7531 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
7532 // FALLTHROUGH
7533 case MVT::v2i64:
7534 case MVT::v4i32:
7535 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7536 // that instruction.
7537 if (Subtarget.hasAVX2()) {
7538 // Scale the blend by the number of 32-bit dwords per element.
7539 int Scale = VT.getScalarSizeInBits() / 32;
7540 BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
7541 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7542 V1 = DAG.getBitcast(BlendVT, V1);
7543 V2 = DAG.getBitcast(BlendVT, V2);
7544 return DAG.getBitcast(
7545 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7546 DAG.getConstant(BlendMask, DL, MVT::i8)));
7547 }
7548 // FALLTHROUGH
7549 case MVT::v8i16: {
7550 // For integer shuffles we need to expand the mask and cast the inputs to
7551 // v8i16s prior to blending.
7552 int Scale = 8 / VT.getVectorNumElements();
7553 BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
7554 V1 = DAG.getBitcast(MVT::v8i16, V1);
7555 V2 = DAG.getBitcast(MVT::v8i16, V2);
7556 return DAG.getBitcast(VT,
7557 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7558 DAG.getConstant(BlendMask, DL, MVT::i8)));
7559 }
7560
7561 case MVT::v16i16: {
7562 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
7563 SmallVector<int, 8> RepeatedMask;
7564 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7565 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7566 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7567 BlendMask = 0;
7568 for (int i = 0; i < 8; ++i)
7569 if (RepeatedMask[i] >= 8)
7570 BlendMask |= 1u << i;
7571 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7572 DAG.getConstant(BlendMask, DL, MVT::i8));
7573 }
7574 }
7575 // FALLTHROUGH
7576 case MVT::v16i8:
7577 case MVT::v32i8: {
7578 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
7579 "256-bit byte-blends require AVX2 support!");
7580
7581 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
7582 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
7583 return Masked;
7584
7585 // Scale the blend by the number of bytes per element.
7586 int Scale = VT.getScalarSizeInBits() / 8;
7587
7588 // This form of blend is always done on bytes. Compute the byte vector
7589 // type.
7590 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
7591
7592 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7593 // mix of LLVM's code generator and the x86 backend. We tell the code
7594 // generator that boolean values in the elements of an x86 vector register
7595 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7596 // mapping a select to operand #1, and 'false' mapping to operand #2. The
7597 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7598 // of the element (the remaining are ignored) and 0 in that high bit would
7599 // mean operand #1 while 1 in the high bit would mean operand #2. So while
7600 // the LLVM model for boolean values in vector elements gets the relevant
7601 // bit set, it is set backwards and over constrained relative to x86's
7602 // actual model.
7603 SmallVector<SDValue, 32> VSELECTMask;
7604 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7605 for (int j = 0; j < Scale; ++j)
7606 VSELECTMask.push_back(
7607 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7608 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
7609 MVT::i8));
7610
7611 V1 = DAG.getBitcast(BlendVT, V1);
7612 V2 = DAG.getBitcast(BlendVT, V2);
7613 return DAG.getBitcast(
7614 VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
7615 DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
7616 }
7617
7618 default:
7619 llvm_unreachable("Not a supported integer vector type!");
7620 }
7621 }
7622
7623 /// \brief Try to lower as a blend of elements from two inputs followed by
7624 /// a single-input permutation.
7625 ///
7626 /// This matches the pattern where we can blend elements from two inputs and
7627 /// then reduce the shuffle to a single-input permutation.
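/// For example, the v4i32 mask <2, 5, 0, 7> becomes the blend <0, 5, 2, 7>
/// followed by the single-input permute <2, 1, 0, 3>.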
7628 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
7629 SDValue V1, SDValue V2,
7630 ArrayRef<int> Mask,
7631 SelectionDAG &DAG) {
7632 // We build up the blend mask while checking whether a blend is a viable way
7633 // to reduce the shuffle.
7634 SmallVector<int, 32> BlendMask(Mask.size(), -1);
7635 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
7636
7637 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7638 if (Mask[i] < 0)
7639 continue;
7640
7641 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
7642
7643 if (BlendMask[Mask[i] % Size] < 0)
7644 BlendMask[Mask[i] % Size] = Mask[i];
7645 else if (BlendMask[Mask[i] % Size] != Mask[i])
7646 return SDValue(); // Can't blend in the needed input!
7647
7648 PermuteMask[i] = Mask[i] % Size;
7649 }
7650
7651 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7652 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
7653 }
7654
7655 /// \brief Generic routine to decompose a shuffle and blend into independent
7656 /// blends and permutes.
7657 ///
7658 /// This matches the extremely common pattern for handling combined
7659 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7660 /// operations. It will try to pick the best arrangement of shuffles and
7661 /// blends.
7662 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
7663 MVT VT, SDValue V1,
7664 SDValue V2,
7665 ArrayRef<int> Mask,
7666 SelectionDAG &DAG) {
7667 // Shuffle the input elements into the desired positions in V1 and V2 and
7668 // blend them together.
7669 SmallVector<int, 32> V1Mask(Mask.size(), -1);
7670 SmallVector<int, 32> V2Mask(Mask.size(), -1);
7671 SmallVector<int, 32> BlendMask(Mask.size(), -1);
7672 for (int i = 0, Size = Mask.size(); i < Size; ++i)
7673 if (Mask[i] >= 0 && Mask[i] < Size) {
7674 V1Mask[i] = Mask[i];
7675 BlendMask[i] = i;
7676 } else if (Mask[i] >= Size) {
7677 V2Mask[i] = Mask[i] - Size;
7678 BlendMask[i] = i + Size;
7679 }
7680
7681 // Try to lower with the simpler initial blend strategy unless one of the
7682 // input shuffles would be a no-op. We prefer to shuffle inputs as the
7683 // shuffle may be able to fold with a load or other benefit. However, when
7684 // we'll have to do 2x as many shuffles in order to achieve this, blending
7685 // first is a better strategy.
7686 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
7687 if (SDValue BlendPerm =
7688 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
7689 return BlendPerm;
7690
7691 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7692 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7693 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7694 }
7695
7696 /// \brief Try to lower a vector shuffle as a byte rotation.
7697 ///
7698 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7699 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7700 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7701 /// try to generically lower a vector shuffle through such a pattern. It
7702 /// does not check for the profitability of lowering either as PALIGNR or
7703 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7704 /// This matches shuffle vectors that look like:
7705 ///
7706 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7707 ///
7708 /// Essentially it concatenates V1 and V2, shifts right by some number of
7709 /// elements, and takes the low elements as the result. Note that while this is
7710 /// specified as a *right shift* because x86 is little-endian, it is a *left
7711 /// rotate* of the vector lanes.
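/// For the v8i16 example above the rotation amount is 3 elements, which the
/// SSSE3 path emits as a PALIGNR with a byte immediate of 6.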
7712 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
7713 SDValue V1, SDValue V2,
7714 ArrayRef<int> Mask,
7715 const X86Subtarget &Subtarget,
7716 SelectionDAG &DAG) {
7717 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7718
7719 int NumElts = Mask.size();
7720 int NumLanes = VT.getSizeInBits() / 128;
7721 int NumLaneElts = NumElts / NumLanes;
7722
7723 // We need to detect various ways of spelling a rotation:
7724 // [11, 12, 13, 14, 15, 0, 1, 2]
7725 // [-1, 12, 13, 14, -1, -1, 1, -1]
7726 // [-1, -1, -1, -1, -1, -1, 1, 2]
7727 // [ 3, 4, 5, 6, 7, 8, 9, 10]
7728 // [-1, 4, 5, 6, -1, -1, 9, -1]
7729 // [-1, 4, 5, 6, -1, -1, -1, -1]
7730 int Rotation = 0;
7731 SDValue Lo, Hi;
7732 for (int l = 0; l < NumElts; l += NumLaneElts) {
7733 for (int i = 0; i < NumLaneElts; ++i) {
7734 if (Mask[l + i] < 0)
7735 continue;
7736
7737 // Get the mod-Size index and lane correct it.
7738 int LaneIdx = (Mask[l + i] % NumElts) - l;
7739 // Make sure it was in this lane.
7740 if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
7741 return SDValue();
7742
7743 // Determine where a rotated vector would have started.
7744 int StartIdx = i - LaneIdx;
7745 if (StartIdx == 0)
7746 // The identity rotation isn't interesting, stop.
7747 return SDValue();
7748
7749 // If we found the tail of a vector the rotation must be the missing
7750 // front. If we found the head of a vector, it must be how much of the
7751 // head.
7752 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
7753
7754 if (Rotation == 0)
7755 Rotation = CandidateRotation;
7756 else if (Rotation != CandidateRotation)
7757 // The rotations don't match, so we can't match this mask.
7758 return SDValue();
7759
7760 // Compute which value this mask is pointing at.
7761 SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
7762
7763 // Compute which of the two target values this index should be assigned
7764 // to. This reflects whether the high elements are remaining or the low
7765 // elements are remaining.
7766 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7767
7768 // Either set up this value if we've not encountered it before, or check
7769 // that it remains consistent.
7770 if (!TargetV)
7771 TargetV = MaskV;
7772 else if (TargetV != MaskV)
7773 // This may be a rotation, but it pulls from the inputs in some
7774 // unsupported interleaving.
7775 return SDValue();
7776 }
7777 }
7778
7779 // Check that we successfully analyzed the mask, and normalize the results.
7780 assert(Rotation != 0 && "Failed to locate a viable rotation!");
7781 assert((Lo || Hi) && "Failed to find a rotated input vector!");
7782 if (!Lo)
7783 Lo = Hi;
7784 else if (!Hi)
7785 Hi = Lo;
7786
7787 // Cast the inputs to i8 vector of correct length to match PALIGNR or
7788 // PSLLDQ/PSRLDQ.
7789 MVT ByteVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
7790 Lo = DAG.getBitcast(ByteVT, Lo);
7791 Hi = DAG.getBitcast(ByteVT, Hi);
7792
7793 // The actual rotate instruction rotates bytes, so we need to scale the
7794 // rotation based on how many bytes are in the vector lane.
7795 int Scale = 16 / NumLaneElts;
7796
7797 // SSSE3 targets can use the palignr instruction.
7798 if (Subtarget.hasSSSE3()) {
7799 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
7800 "512-bit PALIGNR requires BWI instructions");
7801 return DAG.getBitcast(
7802 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
7803 DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
7804 }
7805
7806 assert(VT.is128BitVector() &&
7807 "Rotate-based lowering only supports 128-bit lowering!");
7808 assert(Mask.size() <= 16 &&
7809 "Can shuffle at most 16 bytes in a 128-bit vector!");
7810 assert(ByteVT == MVT::v16i8 &&
7811 "SSE2 rotate lowering only needed for v16i8!");
7812
7813 // Default SSE2 implementation
7814 int LoByteShift = 16 - Rotation * Scale;
7815 int HiByteShift = Rotation * Scale;
7816
7817 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
7818 DAG.getConstant(LoByteShift, DL, MVT::i8));
7819 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
7820 DAG.getConstant(HiByteShift, DL, MVT::i8));
7821 return DAG.getBitcast(VT,
7822 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
7823 }
7824
7825 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
7826 ///
7827 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
7828 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
7829 /// matches elements from one of the input vectors shuffled to the left or
7830 /// right with zeroable elements 'shifted in'. It handles both the strictly
7831 /// bit-wise element shifts and the byte shift across an entire 128-bit double
7832 /// quad word lane.
7833 ///
7834 /// PSLL : (little-endian) left bit shift.
7835 /// [ zz, 0, zz, 2 ]
7836 /// [ -1, 4, zz, -1 ]
7837 /// PSRL : (little-endian) right bit shift.
7838 /// [ 1, zz, 3, zz]
7839 /// [ -1, -1, 7, zz]
7840 /// PSLLDQ : (little-endian) left byte shift
7841 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
7842 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
7843 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
7844 /// PSRLDQ : (little-endian) right byte shift
7845 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
7846 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
7847 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
7848 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
7849 SDValue V2, ArrayRef<int> Mask,
7850 const X86Subtarget &Subtarget,
7851 SelectionDAG &DAG) {
7852 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7853
7854 int Size = Mask.size();
7855 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7856
7857 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
7858 for (int i = 0; i < Size; i += Scale)
7859 for (int j = 0; j < Shift; ++j)
7860 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
7861 return false;
7862
7863 return true;
7864 };
7865
7866 auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
7867 for (int i = 0; i != Size; i += Scale) {
7868 unsigned Pos = Left ? i + Shift : i;
7869 unsigned Low = Left ? i : i + Shift;
7870 unsigned Len = Scale - Shift;
7871 if (!isSequentialOrUndefInRange(Mask, Pos, Len,
7872 Low + (V == V1 ? 0 : Size)))
7873 return SDValue();
7874 }
7875
7876 int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
7877 bool ByteShift = ShiftEltBits > 64;
7878 unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
7879 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
7880 int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
7881
7882 // Normalize the scale for byte shifts to still produce an i64 element
7883 // type.
7884 Scale = ByteShift ? Scale / 2 : Scale;
7885
7886 // We need to round trip through the appropriate type for the shift.
7887 MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
7888 MVT ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8)
7889 : MVT::getVectorVT(ShiftSVT, Size / Scale);
7890 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
7891 "Illegal integer vector type");
7892 V = DAG.getBitcast(ShiftVT, V);
7893
7894 V = DAG.getNode(OpCode, DL, ShiftVT, V,
7895 DAG.getConstant(ShiftAmt, DL, MVT::i8));
7896 return DAG.getBitcast(VT, V);
7897 };
7898
7899 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
7900 // keep doubling the size of the integer elements up to that. We can
7901 // then shift the elements of the integer vector by whole multiples of
7902 // their width within the elements of the larger integer vector. Test each
7903 // multiple to see if we can find a match with the moved element indices
7904 // and that the shifted in elements are all zeroable.
7905 unsigned MaxWidth = (VT.is512BitVector() && !Subtarget.hasBWI() ? 64 : 128);
7906 for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= MaxWidth; Scale *= 2)
7907 for (int Shift = 1; Shift != Scale; ++Shift)
7908 for (bool Left : {true, false})
7909 if (CheckZeros(Shift, Scale, Left))
7910 for (SDValue V : {V1, V2})
7911 if (SDValue Match = MatchShift(Shift, Scale, Left, V))
7912 return Match;
7913
7914 // no match
7915 return SDValue();
7916 }
7917
7918 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
7919 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
7920 SDValue V2, ArrayRef<int> Mask,
7921 SelectionDAG &DAG) {
7922 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7923 assert(!Zeroable.all() && "Fully zeroable shuffle mask");
7924
7925 int Size = Mask.size();
7926 int HalfSize = Size / 2;
7927 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7928
7929 // Upper half must be undefined.
7930 if (!isUndefInRange(Mask, HalfSize, HalfSize))
7931 return SDValue();
7932
7933 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
7934 // Remainder of lower half result is zero and upper half is all undef.
7935 auto LowerAsEXTRQ = [&]() {
7936 // Determine the extraction length from the part of the
7937 // lower half that isn't zeroable.
7938 int Len = HalfSize;
7939 for (; Len > 0; --Len)
7940 if (!Zeroable[Len - 1])
7941 break;
7942 assert(Len > 0 && "Zeroable shuffle mask");
7943
7944 // Attempt to match first Len sequential elements from the lower half.
7945 SDValue Src;
7946 int Idx = -1;
7947 for (int i = 0; i != Len; ++i) {
7948 int M = Mask[i];
7949 if (M < 0)
7950 continue;
7951 SDValue &V = (M < Size ? V1 : V2);
7952 M = M % Size;
7953
7954 // The extracted elements must start at a valid index and all mask
7955 // elements must be in the lower half.
7956 if (i > M || M >= HalfSize)
7957 return SDValue();
7958
7959 if (Idx < 0 || (Src == V && Idx == (M - i))) {
7960 Src = V;
7961 Idx = M - i;
7962 continue;
7963 }
7964 return SDValue();
7965 }
7966
7967 if (Idx < 0)
7968 return SDValue();
7969
7970 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
7971 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
7972 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
7973 return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
7974 DAG.getConstant(BitLen, DL, MVT::i8),
7975 DAG.getConstant(BitIdx, DL, MVT::i8));
7976 };
7977
7978 if (SDValue ExtrQ = LowerAsEXTRQ())
7979 return ExtrQ;
7980
7981 // INSERTQ: Extract lowest Len elements from lower half of second source and
7982 // insert over first source, starting at Idx.
7983 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
7984 auto LowerAsInsertQ = [&]() {
7985 for (int Idx = 0; Idx != HalfSize; ++Idx) {
7986 SDValue Base;
7987
7988 // Attempt to match first source from mask before insertion point.
7989 if (isUndefInRange(Mask, 0, Idx)) {
7990 /* EMPTY */
7991 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
7992 Base = V1;
7993 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
7994 Base = V2;
7995 } else {
7996 continue;
7997 }
7998
7999 // Extend the extraction length looking to match both the insertion of
8000 // the second source and the remaining elements of the first.
8001 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
8002 SDValue Insert;
8003 int Len = Hi - Idx;
8004
8005 // Match insertion.
8006 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
8007 Insert = V1;
8008 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
8009 Insert = V2;
8010 } else {
8011 continue;
8012 }
8013
8014 // Match the remaining elements of the lower half.
8015 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
8016 /* EMPTY */
8017 } else if ((!Base || (Base == V1)) &&
8018 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
8019 Base = V1;
8020 } else if ((!Base || (Base == V2)) &&
8021 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
8022 Size + Hi)) {
8023 Base = V2;
8024 } else {
8025 continue;
8026 }
8027
8028 // We may not have a base (first source) - this can safely be undefined.
8029 if (!Base)
8030 Base = DAG.getUNDEF(VT);
8031
8032 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
8033 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
8034 return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
8035 DAG.getConstant(BitLen, DL, MVT::i8),
8036 DAG.getConstant(BitIdx, DL, MVT::i8));
8037 }
8038 }
8039
8040 return SDValue();
8041 };
8042
8043 if (SDValue InsertQ = LowerAsInsertQ())
8044 return InsertQ;
8045
8046 return SDValue();
8047 }
8048
8049 /// \brief Lower a vector shuffle as a zero or any extension.
8050 ///
8051 /// Given a specific number of elements, element bit width, and extension
8052 /// stride, produce either a zero or any extension based on the available
8053 /// features of the subtarget. The extended elements are consecutive and
8054 /// can start from an offset element index in the input; to avoid excess
8055 /// shuffling, the offset must either be in the bottom lane or at the start
8056 /// of a higher lane. All extended elements must be from
8057 /// the same lane.
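/// For example, with VT == v16i8, Scale == 2 and Offset == 0, the low eight
/// bytes are zero-extended to i16 elements, which the SSE4.1 path emits as a
/// single PMOVZXBW.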
8058 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8059 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
8060 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8061 assert(Scale > 1 && "Need a scale to extend.");
8062 int EltBits = VT.getScalarSizeInBits();
8063 int NumElements = VT.getVectorNumElements();
8064 int NumEltsPerLane = 128 / EltBits;
8065 int OffsetLane = Offset / NumEltsPerLane;
8066 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
8067 "Only 8, 16, and 32 bit elements can be extended.");
8068 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
8069 assert(0 <= Offset && "Extension offset must be positive.");
8070 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
8071 "Extension offset must be in the first lane or start an upper lane.");
8072
8073 // Check that an index is in same lane as the base offset.
8074 auto SafeOffset = [&](int Idx) {
8075 return OffsetLane == (Idx / NumEltsPerLane);
8076 };
8077
8078 // Shift along an input so that the offset base moves to the first element.
8079 auto ShuffleOffset = [&](SDValue V) {
8080 if (!Offset)
8081 return V;
8082
8083 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
8084 for (int i = 0; i * Scale < NumElements; ++i) {
8085 int SrcIdx = i + Offset;
8086 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
8087 }
8088 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
8089 };
8090
8091 // Found a valid zext mask! Try various lowering strategies based on the
8092 // input type and available ISA extensions.
8093 if (Subtarget.hasSSE41()) {
8094     // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
8095 // PUNPCK will catch this in a later shuffle match.
8096 if (Offset && Scale == 2 && VT.is128BitVector())
8097 return SDValue();
8098 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
8099 NumElements / Scale);
8100 InputV = ShuffleOffset(InputV);
8101
8102 // For 256-bit vectors, we only need the lower (128-bit) input half.
8103 if (VT.is256BitVector())
8104 InputV = extract128BitVector(InputV, 0, DAG, DL);
8105
8106 InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
8107 return DAG.getBitcast(VT, InputV);
8108 }
8109
8110 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
8111
8112 // For any extends we can cheat for larger element sizes and use shuffle
8113 // instructions that can fold with a load and/or copy.
8114 if (AnyExt && EltBits == 32) {
8115 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
8116 -1};
8117 return DAG.getBitcast(
8118 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8119 DAG.getBitcast(MVT::v4i32, InputV),
8120 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
8121 }
8122 if (AnyExt && EltBits == 16 && Scale > 2) {
8123 int PSHUFDMask[4] = {Offset / 2, -1,
8124 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
8125 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8126 DAG.getBitcast(MVT::v4i32, InputV),
8127 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
8128 int PSHUFWMask[4] = {1, -1, -1, -1};
8129 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
8130 return DAG.getBitcast(
8131 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
8132 DAG.getBitcast(MVT::v8i16, InputV),
8133 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
8134 }
8135
8136 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
8137 // to 64-bits.
8138 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
8139 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
8140 assert(VT.is128BitVector() && "Unexpected vector width!");
8141
8142 int LoIdx = Offset * EltBits;
8143 SDValue Lo = DAG.getBitcast(
8144 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
8145 DAG.getConstant(EltBits, DL, MVT::i8),
8146 DAG.getConstant(LoIdx, DL, MVT::i8)));
8147
8148 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
8149 !SafeOffset(Offset + 1))
8150 return DAG.getBitcast(VT, Lo);
8151
8152 int HiIdx = (Offset + 1) * EltBits;
8153 SDValue Hi = DAG.getBitcast(
8154 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
8155 DAG.getConstant(EltBits, DL, MVT::i8),
8156 DAG.getConstant(HiIdx, DL, MVT::i8)));
8157 return DAG.getBitcast(VT,
8158 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
8159 }
8160
8161 // If this would require more than 2 unpack instructions to expand, use
8162 // pshufb when available. We can only use more than 2 unpack instructions
8163   // when zero extending i8 elements, which also makes it easier to use pshufb.
8164 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
8165 assert(NumElements == 16 && "Unexpected byte vector width!");
8166 SDValue PSHUFBMask[16];
8167 for (int i = 0; i < 16; ++i) {
8168 int Idx = Offset + (i / Scale);
8169 PSHUFBMask[i] = DAG.getConstant(
8170 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
8171 }
8172 InputV = DAG.getBitcast(MVT::v16i8, InputV);
8173 return DAG.getBitcast(
8174 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
8175 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
8176 }
8177
8178 // If we are extending from an offset, ensure we start on a boundary that
8179 // we can unpack from.
8180 int AlignToUnpack = Offset % (NumElements / Scale);
8181 if (AlignToUnpack) {
8182 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
8183 for (int i = AlignToUnpack; i < NumElements; ++i)
8184 ShMask[i - AlignToUnpack] = i;
8185 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
8186 Offset -= AlignToUnpack;
8187 }
8188
8189 // Otherwise emit a sequence of unpacks.
8190 do {
8191 unsigned UnpackLoHi = X86ISD::UNPCKL;
8192 if (Offset >= (NumElements / 2)) {
8193 UnpackLoHi = X86ISD::UNPCKH;
8194 Offset -= (NumElements / 2);
8195 }
8196
8197 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
8198 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
8199 : getZeroVector(InputVT, Subtarget, DAG, DL);
8200 InputV = DAG.getBitcast(InputVT, InputV);
8201 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
8202 Scale /= 2;
8203 EltBits *= 2;
8204 NumElements /= 2;
8205 } while (Scale > 1);
8206 return DAG.getBitcast(VT, InputV);
8207 }
8208
8209 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
8210 ///
8211 /// This routine will try to do everything in its power to cleverly lower
8212 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
8213 /// check for the profitability of this lowering, it tries to aggressively
8214 /// match this pattern. It will use all of the micro-architectural details it
8215 /// can to emit an efficient lowering. It handles both blends with all-zero
8216 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
8217 /// masking out later).
8218 ///
8219 /// The reason we have dedicated lowering for zext-style shuffles is that they
8220 /// are both incredibly common and often quite performance sensitive.
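/// For example, a v4i32 shuffle with mask <0, Z, 1, Z>, where Z picks a known
/// zero lane, is exactly a zero extension of V1's low two i32 elements to i64
/// and is matched here with Scale == 2.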
8221 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
8222 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8223 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8224 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8225
8226 int Bits = VT.getSizeInBits();
8227 int NumLanes = Bits / 128;
8228 int NumElements = VT.getVectorNumElements();
8229 int NumEltsPerLane = NumElements / NumLanes;
8230 assert(VT.getScalarSizeInBits() <= 32 &&
8231 "Exceeds 32-bit integer zero extension limit");
8232 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
8233
8234 // Define a helper function to check a particular ext-scale and lower to it if
8235 // valid.
8236 auto Lower = [&](int Scale) -> SDValue {
8237 SDValue InputV;
8238 bool AnyExt = true;
8239 int Offset = 0;
8240 int Matches = 0;
8241 for (int i = 0; i < NumElements; ++i) {
8242 int M = Mask[i];
8243 if (M < 0)
8244 continue; // Valid anywhere but doesn't tell us anything.
8245 if (i % Scale != 0) {
8246 // Each of the extended elements needs to be zeroable.
8247 if (!Zeroable[i])
8248 return SDValue();
8249
8250 // We no longer are in the anyext case.
8251 AnyExt = false;
8252 continue;
8253 }
8254
8255 // The base elements need to form consecutive indices into the same input
8256 // vector.
8257 SDValue V = M < NumElements ? V1 : V2;
8258 M = M % NumElements;
8259 if (!InputV) {
8260 InputV = V;
8261 Offset = M - (i / Scale);
8262 } else if (InputV != V)
8263 return SDValue(); // Flip-flopping inputs.
8264
8265 // Offset must start in the lowest 128-bit lane or at the start of an
8266 // upper lane.
8267 // FIXME: Is it ever worth allowing a negative base offset?
8268 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
8269 (Offset % NumEltsPerLane) == 0))
8270 return SDValue();
8271
8272 // If we are offsetting, all referenced entries must come from the same
8273 // lane.
8274 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
8275 return SDValue();
8276
8277 if ((M % NumElements) != (Offset + (i / Scale)))
8278 return SDValue(); // Non-consecutive strided elements.
8279 Matches++;
8280 }
8281
8282 // If we fail to find an input, we have a zero-shuffle which should always
8283 // have already been handled.
8284 // FIXME: Maybe handle this here in case during blending we end up with one?
8285 if (!InputV)
8286 return SDValue();
8287
8288 // If we are offsetting, don't extend if we only match a single input; we
8289 // can always do better by using a basic PSHUF or PUNPCK.
8290 if (Offset != 0 && Matches < 2)
8291 return SDValue();
8292
8293 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8294 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
8295 };
8296
8297 // The widest scale possible for extending is to a 64-bit integer.
8298 assert(Bits % 64 == 0 &&
8299 "The number of bits in a vector must be divisible by 64 on x86!");
8300 int NumExtElements = Bits / 64;
8301
8302 // Each iteration, try extending the elements half as much, but into twice as
8303 // many elements.
8304 for (; NumExtElements < NumElements; NumExtElements *= 2) {
8305 assert(NumElements % NumExtElements == 0 &&
8306 "The input vector size must be divisible by the extended size.");
8307 if (SDValue V = Lower(NumElements / NumExtElements))
8308 return V;
8309 }
8310
8311 // General extends failed, but 128-bit vectors may be able to use MOVQ.
8312 if (Bits != 128)
8313 return SDValue();
8314
8315 // Returns one of the source operands if the shuffle can be reduced to a
8316 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
8317 auto CanZExtLowHalf = [&]() {
8318 for (int i = NumElements / 2; i != NumElements; ++i)
8319 if (!Zeroable[i])
8320 return SDValue();
8321 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
8322 return V1;
8323 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
8324 return V2;
8325 return SDValue();
8326 };
8327
8328 if (SDValue V = CanZExtLowHalf()) {
8329 V = DAG.getBitcast(MVT::v2i64, V);
8330 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
8331 return DAG.getBitcast(VT, V);
8332 }
8333
8334 // No viable ext lowering found.
8335 return SDValue();
8336 }
8337
8338 /// \brief Try to get a scalar value for a specific element of a vector.
8339 ///
8340 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
8341 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
8342 SelectionDAG &DAG) {
8343 MVT VT = V.getSimpleValueType();
8344 MVT EltVT = VT.getVectorElementType();
8345 V = peekThroughBitcasts(V);
8346
8347 // If the bitcasts shift the element size, we can't extract an equivalent
8348 // element from it.
8349 MVT NewVT = V.getSimpleValueType();
8350 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
8351 return SDValue();
8352
8353 if (V.getOpcode() == ISD::BUILD_VECTOR ||
8354 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
8355 // Ensure the scalar operand is the same size as the destination.
8356 // FIXME: Add support for scalar truncation where possible.
8357 SDValue S = V.getOperand(Idx);
8358 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
8359 return DAG.getBitcast(EltVT, S);
8360 }
8361
8362 return SDValue();
8363 }
8364
8365 /// \brief Helper to test for a load that can be folded with x86 shuffles.
8366 ///
8367 /// This is particularly important because the set of instructions varies
8368 /// significantly based on whether the operand is a load or not.
8369 static bool isShuffleFoldableLoad(SDValue V) {
8370 V = peekThroughBitcasts(V);
8371 return ISD::isNON_EXTLoad(V.getNode());
8372 }
8373
8374 /// \brief Try to lower insertion of a single element into a zero vector.
8375 ///
8376 /// This is a common pattern for which we have especially efficient lowerings
8377 /// across all subtarget feature sets.
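/// For example, shuffling V2 into an all-zero V1 with the v4i32 mask
/// <4, Z, Z, Z> (Z selecting a zero lane) becomes a single VZEXT_MOVL of V2,
/// which clears every lane above the lowest.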
8378 static SDValue lowerVectorShuffleAsElementInsertion(
8379 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8380 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8381 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8382 MVT ExtVT = VT;
8383 MVT EltVT = VT.getVectorElementType();
8384
8385 int V2Index = std::find_if(Mask.begin(), Mask.end(),
8386 [&Mask](int M) { return M >= (int)Mask.size(); }) -
8387 Mask.begin();
8388 bool IsV1Zeroable = true;
8389 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8390 if (i != V2Index && !Zeroable[i]) {
8391 IsV1Zeroable = false;
8392 break;
8393 }
8394
8395 // Check for a single input from a SCALAR_TO_VECTOR node.
8396 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8397 // all the smarts here sunk into that routine. However, the current
8398 // lowering of BUILD_VECTOR makes that nearly impossible until the old
8399 // vector shuffle lowering is dead.
8400 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
8401 DAG);
8402 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
8403 // We need to zext the scalar if it is smaller than an i32.
8404 V2S = DAG.getBitcast(EltVT, V2S);
8405 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8406 // Using zext to expand a narrow element won't work for non-zero
8407 // insertions.
8408 if (!IsV1Zeroable)
8409 return SDValue();
8410
8411 // Zero-extend directly to i32.
8412 ExtVT = MVT::v4i32;
8413 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8414 }
8415 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8416 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8417 EltVT == MVT::i16) {
8418 // Either not inserting from the low element of the input or the input
8419 // element size is too small to use VZEXT_MOVL to clear the high bits.
8420 return SDValue();
8421 }
8422
8423 if (!IsV1Zeroable) {
8424 // If V1 can't be treated as a zero vector we have fewer options to lower
8425 // this. We can't support integer vectors or non-zero targets cheaply, and
8426 // the V1 elements can't be permuted in any way.
8427 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8428 if (!VT.isFloatingPoint() || V2Index != 0)
8429 return SDValue();
8430 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8431 V1Mask[V2Index] = -1;
8432 if (!isNoopShuffleMask(V1Mask))
8433 return SDValue();
8434 // This is essentially a special case blend operation, but if we have
8435 // general purpose blend operations, they are always faster. Bail and let
8436 // the rest of the lowering handle these as blends.
8437 if (Subtarget.hasSSE41())
8438 return SDValue();
8439
8440 // Otherwise, use MOVSD or MOVSS.
8441 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8442 "Only two types of floating point element types to handle!");
8443 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8444 ExtVT, V1, V2);
8445 }
8446
8447 // This lowering only works for the low element with floating point vectors.
8448 if (VT.isFloatingPoint() && V2Index != 0)
8449 return SDValue();
8450
8451 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8452 if (ExtVT != VT)
8453 V2 = DAG.getBitcast(VT, V2);
8454
8455 if (V2Index != 0) {
8456 // If we have 4 or fewer lanes we can cheaply shuffle the element into
8457 // the desired position. Otherwise it is more efficient to do a vector
8458 // shift left. We know that we can do a vector shift left because all
8459 // the inputs are zero.
8460 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8461 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8462 V2Shuffle[V2Index] = 0;
8463 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8464 } else {
8465 V2 = DAG.getBitcast(MVT::v16i8, V2);
8466 V2 = DAG.getNode(
8467 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
8468 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
8469 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
8470 DAG.getDataLayout(), VT)));
8471 V2 = DAG.getBitcast(VT, V2);
8472 }
8473 }
8474 return V2;
8475 }
8476
8477 /// Try to lower broadcast of a single - truncated - integer element,
8478 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
8479 ///
8480 /// This assumes we have AVX2.
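/// For example, broadcasting lane 0 of a v2i64 BUILD_VECTOR into a v4i32
/// shuffle result truncates the 64-bit scalar to i32 and emits a VBROADCAST
/// (VPBROADCASTD) of that truncated scalar.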
8481 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
8482 SDValue V0, int BroadcastIdx,
8483 const X86Subtarget &Subtarget,
8484 SelectionDAG &DAG) {
8485 assert(Subtarget.hasAVX2() &&
8486 "We can only lower integer broadcasts with AVX2!");
8487
8488 EVT EltVT = VT.getVectorElementType();
8489 EVT V0VT = V0.getValueType();
8490
8491 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
8492 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
8493
8494 EVT V0EltVT = V0VT.getVectorElementType();
8495 if (!V0EltVT.isInteger())
8496 return SDValue();
8497
8498 const unsigned EltSize = EltVT.getSizeInBits();
8499 const unsigned V0EltSize = V0EltVT.getSizeInBits();
8500
8501 // This is only a truncation if the original element type is larger.
8502 if (V0EltSize <= EltSize)
8503 return SDValue();
8504
8505 assert(((V0EltSize % EltSize) == 0) &&
8506 "Scalar type sizes must all be powers of 2 on x86!");
8507
8508 const unsigned V0Opc = V0.getOpcode();
8509 const unsigned Scale = V0EltSize / EltSize;
8510 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
8511
8512 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
8513 V0Opc != ISD::BUILD_VECTOR)
8514 return SDValue();
8515
8516 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
8517
8518 // If we're extracting non-least-significant bits, shift so we can truncate.
8519 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
8520 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
8521 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
8522 if (const int OffsetIdx = BroadcastIdx % Scale)
8523 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
8524 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
8525
8526 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
8527 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
8528 }
8529
8530 /// \brief Try to lower broadcast of a single element.
8531 ///
8532 /// For convenience, this code also bundles all of the subtarget feature set
8533 /// filtering. While a little annoying to re-dispatch on type here, there isn't
8534 /// a convenient way to factor it out.
8535 /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
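/// For example, a v2f64 splat of lane 0 becomes MOVDDUP from SSE3 onward,
/// while a v4f32 splat needs AVX and becomes VBROADCASTSS.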
8536 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
8537 SDValue V1, SDValue V2,
8538 ArrayRef<int> Mask,
8539 const X86Subtarget &Subtarget,
8540 SelectionDAG &DAG) {
8541 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
8542 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
8543 (Subtarget.hasAVX2() && VT.isInteger())))
8544 return SDValue();
8545
8546 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
8547 // we can only broadcast from a register with AVX2.
8548 unsigned NumElts = Mask.size();
8549 unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
8550 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
8551
8552 // Check that the mask is a broadcast.
8553 int BroadcastIdx = -1;
8554 for (int i = 0; i != (int)NumElts; ++i) {
8555 SmallVector<int, 8> BroadcastMask(NumElts, i);
8556 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
8557 BroadcastIdx = i;
8558 break;
8559 }
8560 }
8561
8562 if (BroadcastIdx < 0)
8563 return SDValue();
8564 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8565 "a sorted mask where the broadcast "
8566 "comes from V1.");
8567
8568 // Go up the chain of (vector) values to find a scalar load that we can
8569 // combine with the broadcast.
8570 SDValue V = V1;
8571 for (;;) {
8572 switch (V.getOpcode()) {
8573 case ISD::BITCAST: {
8574 SDValue VSrc = V.getOperand(0);
8575 MVT SrcVT = VSrc.getSimpleValueType();
8576 if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
8577 break;
8578 V = VSrc;
8579 continue;
8580 }
8581 case ISD::CONCAT_VECTORS: {
8582 int OperandSize = Mask.size() / V.getNumOperands();
8583 V = V.getOperand(BroadcastIdx / OperandSize);
8584 BroadcastIdx %= OperandSize;
8585 continue;
8586 }
8587 case ISD::INSERT_SUBVECTOR: {
8588 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8589 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8590 if (!ConstantIdx)
8591 break;
8592
8593 int BeginIdx = (int)ConstantIdx->getZExtValue();
8594 int EndIdx =
8595 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
8596 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8597 BroadcastIdx -= BeginIdx;
8598 V = VInner;
8599 } else {
8600 V = VOuter;
8601 }
8602 continue;
8603 }
8604 }
8605 break;
8606 }
8607
8608 // Check if this is a broadcast of a scalar. We special case lowering
8609 // for scalars so that we can more effectively fold with loads.
8610 // First, look through bitcast: if the original value has a larger element
8611 // type than the shuffle, the broadcast element is in essence truncated.
8612 // Make that explicit to ease folding.
8613 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
8614 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
8615 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
8616 return TruncBroadcast;
8617
8618 MVT BroadcastVT = VT;
8619
8620 // Peek through any bitcast (only useful for loads).
8621 SDValue BC = peekThroughBitcasts(V);
8622
8623 // Also check the simpler case, where we can directly reuse the scalar.
8624 if (V.getOpcode() == ISD::BUILD_VECTOR ||
8625 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8626 V = V.getOperand(BroadcastIdx);
8627
8628 // If we can't broadcast from a register, check that the input is a load.
8629 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
8630 return SDValue();
8631 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
8632 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
8633 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
8634 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
8635 Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
8636 }
8637
8638 // If we are broadcasting a load that is only used by the shuffle
8639 // then we can reduce the vector load to the broadcasted scalar load.
8640 LoadSDNode *Ld = cast<LoadSDNode>(BC);
8641 SDValue BaseAddr = Ld->getOperand(1);
8642 EVT SVT = BroadcastVT.getScalarType();
8643 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
8644 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
8645 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
8646 DAG.getMachineFunction().getMachineMemOperand(
8647 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
8648 } else if (!BroadcastFromReg) {
8649 // We can't broadcast from a vector register.
8650 return SDValue();
8651 } else if (BroadcastIdx != 0) {
8652 // We can only broadcast from the zero-element of a vector register,
8653 // but it can be advantageous to broadcast from the zero-element of a
8654 // subvector.
8655 if (!VT.is256BitVector() && !VT.is512BitVector())
8656 return SDValue();
8657
8658 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
8659 if (VT == MVT::v4f64 || VT == MVT::v4i64)
8660 return SDValue();
8661
8662 // Only broadcast the zero-element of a 128-bit subvector.
8663 unsigned EltSize = VT.getScalarSizeInBits();
8664 if (((BroadcastIdx * EltSize) % 128) != 0)
8665 return SDValue();
8666
8667 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
8668 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
8669 DAG.getIntPtrConstant(BroadcastIdx, DL));
8670 }
8671
8672 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
8673 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
8674 DAG.getBitcast(MVT::f64, V));
8675
8676 // Bitcast back to the same scalar type as BroadcastVT.
8677 MVT SrcVT = V.getSimpleValueType();
8678 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
8679 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
8680 "Unexpected vector element size");
8681 if (SrcVT.isVector()) {
8682 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8683 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
8684 } else {
8685 SrcVT = BroadcastVT.getScalarType();
8686 }
8687 V = DAG.getBitcast(SrcVT, V);
8688 }
8689
8690 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
8691 }
8692
8693 // Check for whether we can use INSERTPS to perform the shuffle. We only use
8694 // INSERTPS when the V1 elements are already in the correct locations
8695 // because otherwise we can just always use two SHUFPS instructions which
8696 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8697 // perform INSERTPS if a single V1 element is out of place and all V2
8698 // elements are zeroable.
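// The INSERTPS immediate encodes the source element in bits [7:6], the
// destination element in bits [5:4] and the zero mask in bits [3:0], which is
// exactly the InsertPSMask built below.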
8699 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
8700 unsigned &InsertPSMask,
8701 const SmallBitVector &Zeroable,
8702 ArrayRef<int> Mask,
8703 SelectionDAG &DAG) {
8704 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
8705 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
8706 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8707 unsigned ZMask = 0;
8708 int V1DstIndex = -1;
8709 int V2DstIndex = -1;
8710 bool V1UsedInPlace = false;
8711
8712 for (int i = 0; i < 4; ++i) {
8713 // Synthesize a zero mask from the zeroable elements (includes undefs).
8714 if (Zeroable[i]) {
8715 ZMask |= 1 << i;
8716 continue;
8717 }
8718
8719 // Flag if we use any V1 inputs in place.
8720 if (i == Mask[i]) {
8721 V1UsedInPlace = true;
8722 continue;
8723 }
8724
8725 // We can only insert a single non-zeroable element.
8726 if (V1DstIndex >= 0 || V2DstIndex >= 0)
8727 return false;
8728
8729 if (Mask[i] < 4) {
8730 // V1 input out of place for insertion.
8731 V1DstIndex = i;
8732 } else {
8733 // V2 input for insertion.
8734 V2DstIndex = i;
8735 }
8736 }
8737
8738 // Don't bother if we have no (non-zeroable) element for insertion.
8739 if (V1DstIndex < 0 && V2DstIndex < 0)
8740 return false;
8741
8742 // Determine element insertion src/dst indices. The src index is from the
8743 // start of the inserted vector, not the start of the concatenated vector.
8744 unsigned V2SrcIndex = 0;
8745 if (V1DstIndex >= 0) {
8746 // If we have a V1 input out of place, we use V1 as the V2 element insertion
8747 // and don't use the original V2 at all.
8748 V2SrcIndex = Mask[V1DstIndex];
8749 V2DstIndex = V1DstIndex;
8750 V2 = V1;
8751 } else {
8752 V2SrcIndex = Mask[V2DstIndex] - 4;
8753 }
8754
8755 // If no V1 inputs are used in place, then the result is created only from
8756 // the zero mask and the V2 insertion - so remove V1 dependency.
8757 if (!V1UsedInPlace)
8758 V1 = DAG.getUNDEF(MVT::v4f32);
8759
8760 // Insert the V2 element into the desired position.
8761 InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8762 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8763 return true;
8764 }
8765
8766 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
8767 SDValue V2, ArrayRef<int> Mask,
8768 SelectionDAG &DAG) {
8769 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8770 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8771 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8772
8773 // Attempt to match the insertps pattern.
8774 unsigned InsertPSMask;
8775 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
8776 return SDValue();
8777
8778 // Insert the V2 element into the desired position.
8779 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8780 DAG.getConstant(InsertPSMask, DL, MVT::i8));
8781 }
8782
8783 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
8784 /// UNPCK instruction.
8785 ///
8786 /// This specifically targets cases where we end up with alternating between
8787 /// the two inputs, and so can permute them into something that feeds a single
8788 /// UNPCK instruction. Note that this routine only targets integer vectors
8789 /// because for floating point vectors we have a generalized SHUFPS lowering
8790 /// strategy that handles everything that doesn't *exactly* match an unpack,
8791 /// making this clever lowering unnecessary.
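/// For example, the v4i32 mask <0, 4, 2, 6> interleaves the even elements of
/// both inputs; pre-shuffling each input with <0, 2, -1, -1> reduces it to a
/// single UNPCKL.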
8792 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
8793 SDValue V1, SDValue V2,
8794 ArrayRef<int> Mask,
8795 SelectionDAG &DAG) {
8796 assert(!VT.isFloatingPoint() &&
8797 "This routine only supports integer vectors.");
8798 assert(VT.is128BitVector() &&
8799 "This routine only works on 128-bit vectors.");
8800 assert(!V2.isUndef() &&
8801 "This routine should only be used when blending two inputs.");
8802 assert(Mask.size() >= 2 && "Single element masks are invalid.");
8803
8804 int Size = Mask.size();
8805
8806 int NumLoInputs =
8807 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
8808 int NumHiInputs =
8809 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
8810
8811 bool UnpackLo = NumLoInputs >= NumHiInputs;
8812
8813 auto TryUnpack = [&](MVT UnpackVT, int Scale) {
8814 SmallVector<int, 16> V1Mask(Mask.size(), -1);
8815 SmallVector<int, 16> V2Mask(Mask.size(), -1);
8816
8817 for (int i = 0; i < Size; ++i) {
8818 if (Mask[i] < 0)
8819 continue;
8820
8821 // Each element of the unpack contains Scale elements from this mask.
8822 int UnpackIdx = i / Scale;
8823
8824 // We only handle the case where V1 feeds the first slots of the unpack.
8825 // We rely on canonicalization to ensure this is the case.
8826 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
8827 return SDValue();
8828
8829 // Setup the mask for this input. The indexing is tricky as we have to
8830 // handle the unpack stride.
8831 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
8832 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
8833 Mask[i] % Size;
8834 }
8835
8836 // If we will have to shuffle both inputs to use the unpack, check whether
8837 // we can just unpack first and shuffle the result. If so, skip this unpack.
8838 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
8839 !isNoopShuffleMask(V2Mask))
8840 return SDValue();
8841
8842 // Shuffle the inputs into place.
8843 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
8844 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
8845
8846 // Cast the inputs to the type we will use to unpack them.
8847 V1 = DAG.getBitcast(UnpackVT, V1);
8848 V2 = DAG.getBitcast(UnpackVT, V2);
8849
8850 // Unpack the inputs and cast the result back to the desired type.
8851 return DAG.getBitcast(
8852 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
8853 UnpackVT, V1, V2));
8854 };
8855
8856 // We try each unpack from the largest to the smallest, looking for one that
8857 // fits this mask.
8858 int OrigNumElements = VT.getVectorNumElements();
8859 int OrigScalarSize = VT.getScalarSizeInBits();
8860 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
8861 int Scale = ScalarSize / OrigScalarSize;
8862 int NumElements = OrigNumElements / Scale;
8863 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
8864 if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
8865 return Unpack;
8866 }
8867
8868 // If none of the unpack-rooted lowerings worked (or were profitable) try an
8869 // initial unpack.
8870 if (NumLoInputs == 0 || NumHiInputs == 0) {
8871 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
8872 "We have to have *some* inputs!");
8873 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
8874
8875 // FIXME: We could consider the total complexity of the permute of each
8876 // possible unpacking. Or at the least we should consider how many
8877 // half-crossings are created.
8878 // FIXME: We could consider commuting the unpacks.
8879
8880 SmallVector<int, 32> PermMask((unsigned)Size, -1);
8881 for (int i = 0; i < Size; ++i) {
8882 if (Mask[i] < 0)
8883 continue;
8884
8885 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
8886
8887 PermMask[i] =
8888 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
8889 }
8890 return DAG.getVectorShuffle(
8891 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
8892 DL, VT, V1, V2),
8893 DAG.getUNDEF(VT), PermMask);
8894 }
8895
8896 return SDValue();
8897 }
8898
8899 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8900 ///
8901 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
8902 /// support for floating point shuffles but not integer shuffles. These
8903 /// instructions will incur a domain crossing penalty on some chips though so
8904 /// it is better to avoid lowering through this for integer vectors where
8905 /// possible.
8906 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
8907 SDValue V1, SDValue V2,
8908 const X86Subtarget &Subtarget,
8909 SelectionDAG &DAG) {
8910 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8911 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8912 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8913
8914 if (V2.isUndef()) {
8915 // Check for being able to broadcast a single element.
8916 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
8917 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
8918 return Broadcast;
8919
8920 // Straight shuffle of a single input vector. Simulate this by using the
8921 // single input as both of the "inputs" to this instruction.
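// Each bit of the SHUFPD immediate selects the high (1) or low (0) element of
// the corresponding source operand, so e.g. splatting the high element uses
// the immediate 0b11.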
8922 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
8923
8924 if (Subtarget.hasAVX()) {
8925 // If we have AVX, we can use VPERMILPS which will allow folding a load
8926 // into the shuffle.
8927 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8928 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
8929 }
8930
8931 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1,
8932 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
8933 }
8934 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8935 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8936
8937 // If we have a single input, insert that into V1 if we can do so cheaply.
8938 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8939 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8940 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
8941 return Insertion;
8942 // Try inverting the insertion since for v2 masks it is easy to do and we
8943 // can't reliably sort the mask one way or the other.
8944 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8945 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8946 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8947 DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
8948 return Insertion;
8949 }
8950
8951 // Try to use one of the special instruction patterns to handle two common
8952 // blend patterns if a zero-blend above didn't work.
8953 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
8954 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
8955 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8956 // We can either use a special instruction to load over the low double or
8957 // to move just the low double.
8958 return DAG.getNode(
8959 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8960 DL, MVT::v2f64, V2,
8961 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8962
8963 if (Subtarget.hasSSE41())
8964 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8965 Subtarget, DAG))
8966 return Blend;
8967
8968 // Use dedicated unpack instructions for masks that match their pattern.
8969 if (SDValue V =
8970 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
8971 return V;
8972
8973 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8974 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
8975 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
8976 }
8977
8978 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8979 ///
8980 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8981 /// the integer unit to minimize domain crossing penalties. However, for blends
8982 /// it falls back to the floating point shuffle operation with appropriate bit
8983 /// casting.
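/// For example, without SSE4.1 blends the mask <0, 3> has no cheap integer
/// form, so it ends up as a SHUFPD on the operands bitcast to v2f64.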
8984 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
8985 SDValue V1, SDValue V2,
8986 const X86Subtarget &Subtarget,
8987 SelectionDAG &DAG) {
8988 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8989 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8990 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8991
8992 if (V2.isUndef()) {
8993 // Check for being able to broadcast a single element.
8994 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
8995 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8996 return Broadcast;
8997
8998 // Straight shuffle of a single input vector. For everything from SSE2
8999 // onward this has a single fast instruction with no scary immediates.
9000 // We have to map the mask as it is actually a v4i32 shuffle instruction.
9001 V1 = DAG.getBitcast(MVT::v4i32, V1);
9002 int WidenedMask[4] = {
9003 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
9004 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
9005 return DAG.getBitcast(
9006 MVT::v2i64,
9007 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
9008 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
9009 }
9010 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
9011 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
9012 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
9013 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
9014
9015 // If we have a blend of two same-type PACKUS operations and the blend aligns
9016 // with the low and high halves, we can just merge the PACKUS operations.
9017 // This is particularly important as it lets us merge shuffles that this
9018 // routine itself creates.
9019 auto GetPackNode = [](SDValue V) {
9020 V = peekThroughBitcasts(V);
9021 return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
9022 };
9023 if (SDValue V1Pack = GetPackNode(V1))
9024 if (SDValue V2Pack = GetPackNode(V2)) {
9025 EVT PackVT = V1Pack.getValueType();
9026 if (PackVT == V2Pack.getValueType())
9027 return DAG.getBitcast(MVT::v2i64,
9028 DAG.getNode(X86ISD::PACKUS, DL, PackVT,
9029 Mask[0] == 0 ? V1Pack.getOperand(0)
9030 : V1Pack.getOperand(1),
9031 Mask[1] == 2 ? V2Pack.getOperand(0)
9032 : V2Pack.getOperand(1)));
9033 }
9034
9035 // Try to use shift instructions.
9036 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
9037 Subtarget, DAG))
9038 return Shift;
9039
9040 // When loading a scalar and then shuffling it into a vector we can often do
9041 // the insertion cheaply.
9042 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9043 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9044 return Insertion;
9045 // Try inverting the insertion since for v2 masks it is easy to do and we
9046 // can't reliably sort the mask one way or the other.
9047 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
9048 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9049 DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
9050 return Insertion;
9051
9052 // We have different paths for blend lowering, but they all must use the
9053 // *exact* same predicate.
9054 bool IsBlendSupported = Subtarget.hasSSE41();
9055 if (IsBlendSupported)
9056 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
9057 Subtarget, DAG))
9058 return Blend;
9059
9060 // Use dedicated unpack instructions for masks that match their pattern.
9061 if (SDValue V =
9062 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
9063 return V;
9064
9065 // Try to use byte rotation instructions.
9066 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
9067 if (Subtarget.hasSSSE3())
9068 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9069 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9070 return Rotate;
9071
9072 // If we have direct support for blends, we should lower by decomposing into
9073 // a permute. That will be faster than the domain cross.
9074 if (IsBlendSupported)
9075 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
9076 Mask, DAG);
9077
9078 // We implement this with SHUFPD which is pretty lame because it will likely
9079 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
9080 // However, all the alternatives are still more cycles and newer chips don't
9081 // have this problem. It would be really nice if x86 had better shuffles here.
9082 V1 = DAG.getBitcast(MVT::v2f64, V1);
9083 V2 = DAG.getBitcast(MVT::v2f64, V2);
9084 return DAG.getBitcast(MVT::v2i64,
9085 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
9086 }
9087
9088 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
9089 ///
9090 /// This is used to disable more specialized lowerings when the shufps lowering
9091 /// will happen to be efficient.
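/// For example, <0, 1, 4, 5> can be done with one SHUFPS (low half from V1,
/// high half from V2), whereas <0, 4, 1, 5> cannot because each half mixes
/// both inputs.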
9092 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
9093 // This routine only handles 128-bit shufps.
9094 assert(Mask.size() == 4 && "Unsupported mask size!");
9095 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
9096 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
9097 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
9098 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
9099
9100 // To lower with a single SHUFPS we need to have the low half and high half
9101 // each requiring a single input.
9102 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
9103 return false;
9104 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
9105 return false;
9106
9107 return true;
9108 }
9109
9110 /// \brief Lower a vector shuffle using the SHUFPS instruction.
9111 ///
9112 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
9113 /// It makes no assumptions about whether this is the *best* lowering, it simply
9114 /// uses it.
9115 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
9116 ArrayRef<int> Mask, SDValue V1,
9117 SDValue V2, SelectionDAG &DAG) {
9118 SDValue LowV = V1, HighV = V2;
9119 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
9120
9121 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9122
9123 if (NumV2Elements == 1) {
9124 int V2Index =
9125 std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
9126 Mask.begin();
9127
9128 // Compute the index adjacent to V2Index and in the same half by toggling
9129 // the low bit.
9130 int V2AdjIndex = V2Index ^ 1;
9131
9132 if (Mask[V2AdjIndex] < 0) {
9133 // Handles all the cases where we have a single V2 element and an undef.
9134 // This will only ever happen in the high lanes because we commute the
9135 // vector otherwise.
9136 if (V2Index < 2)
9137 std::swap(LowV, HighV);
9138 NewMask[V2Index] -= 4;
9139 } else {
9140 // Handle the case where the V2 element ends up adjacent to a V1 element.
9141 // To make this work, blend them together as the first step.
9142 int V1Index = V2AdjIndex;
9143 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
9144 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
9145 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
9146
9147 // Now proceed to reconstruct the final blend as we have the necessary
9148 // high or low half formed.
9149 if (V2Index < 2) {
9150 LowV = V2;
9151 HighV = V1;
9152 } else {
9153 HighV = V2;
9154 }
9155 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
9156 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
9157 }
9158 } else if (NumV2Elements == 2) {
9159 if (Mask[0] < 4 && Mask[1] < 4) {
9160 // Handle the easy case where we have V1 in the low lanes and V2 in the
9161 // high lanes.
9162 NewMask[2] -= 4;
9163 NewMask[3] -= 4;
9164 } else if (Mask[2] < 4 && Mask[3] < 4) {
9165 // We also handle the reversed case because this utility may get called
9166 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
9167 // arrange things in the right direction.
9168 NewMask[0] -= 4;
9169 NewMask[1] -= 4;
9170 HighV = V1;
9171 LowV = V2;
9172 } else {
9173 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
9174 // trying to place elements directly, just blend them and set up the final
9175 // shuffle to place them.
9176
9177 // The first two blend mask elements are for V1, the second two are for
9178 // V2.
9179 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
9180 Mask[2] < 4 ? Mask[2] : Mask[3],
9181 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
9182 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
9183 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
9184 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
9185
9186 // Now we do a normal shuffle of V1 by giving V1 as both operands to
9187 // a blend.
9188 LowV = HighV = V1;
9189 NewMask[0] = Mask[0] < 4 ? 0 : 2;
9190 NewMask[1] = Mask[0] < 4 ? 2 : 0;
9191 NewMask[2] = Mask[2] < 4 ? 1 : 3;
9192 NewMask[3] = Mask[2] < 4 ? 3 : 1;
9193 }
9194 }
9195 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
9196 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
9197 }
9198
9199 /// \brief Lower 4-lane 32-bit floating point shuffles.
9200 ///
9201 /// Uses instructions exclusively from the floating point unit to minimize
9202 /// domain crossing penalties, as these are sufficient to implement all v4f32
9203 /// shuffles.
9204 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9205 SDValue V1, SDValue V2,
9206 const X86Subtarget &Subtarget,
9207 SelectionDAG &DAG) {
9208 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9209 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9210 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9211
9212 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9213
9214 if (NumV2Elements == 0) {
9215 // Check for being able to broadcast a single element.
9216 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9217 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
9218 return Broadcast;
9219
9220 // Use even/odd duplicate instructions for masks that match their pattern.
9221 if (Subtarget.hasSSE3()) {
9222 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
9223 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
9224 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
9225 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
9226 }
9227
9228 if (Subtarget.hasAVX()) {
9229 // If we have AVX, we can use VPERMILPS which will allow folding a load
9230 // into the shuffle.
9231 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
9232 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9233 }
9234
9235 // Otherwise, use a straight shuffle of a single input vector. We pass the
9236 // input vector to both operands to simulate this with a SHUFPS.
9237 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
9238 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9239 }
9240
9241 // There are special ways we can lower some single-element blends. However, we
9242 // have custom ways we can lower more complex single-element blends below that
9243 // we defer to if both this and BLENDPS fail to match, so restrict this to
9244 // when the V2 input is targeting element 0 of the mask -- that is the fast
9245 // case here.
9246 if (NumV2Elements == 1 && Mask[0] >= 4)
9247 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
9248 Mask, Subtarget, DAG))
9249 return V;
9250
9251 if (Subtarget.hasSSE41()) {
9252 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
9253 Subtarget, DAG))
9254 return Blend;
9255
9256 // Use INSERTPS if we can complete the shuffle efficiently.
9257 if (SDValue V = lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, DAG))
9258 return V;
9259
9260 if (!isSingleSHUFPSMask(Mask))
9261 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
9262 DL, MVT::v4f32, V1, V2, Mask, DAG))
9263 return BlendPerm;
9264 }
9265
9266 // Use low/high mov instructions.
9267 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
9268 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
9269 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
9270 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
9271
9272 // Use dedicated unpack instructions for masks that match their pattern.
9273 if (SDValue V =
9274 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
9275 return V;
9276
9277 // Otherwise fall back to a SHUFPS lowering strategy.
9278 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
9279 }
9280
9281 /// \brief Lower 4-lane i32 vector shuffles.
9282 ///
9283 /// We try to handle these with integer-domain shuffles where we can, but for
9284 /// blends we use the floating point domain blend instructions.
9285 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9286 SDValue V1, SDValue V2,
9287 const X86Subtarget &Subtarget,
9288 SelectionDAG &DAG) {
9289 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
9290 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
9291 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9292
9293 // Whenever we can lower this as a zext, that instruction is strictly faster
9294 // than any alternative. It also allows us to fold memory operands into the
9295 // shuffle in many cases.
9296 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
9297 Mask, Subtarget, DAG))
9298 return ZExt;
9299
9300 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9301
9302 if (NumV2Elements == 0) {
9303 // Check for being able to broadcast a single element.
9304 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9305 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
9306 return Broadcast;
9307
9308 // Straight shuffle of a single input vector. For everything from SSE2
9309 // onward this has a single fast instruction with no scary immediates.
9310 // We coerce the shuffle pattern to be compatible with UNPCK instructions
9311 // but we aren't actually going to use the UNPCK instruction because doing
9312 // so prevents folding a load into this instruction or making a copy.
9313 const int UnpackLoMask[] = {0, 0, 1, 1};
9314 const int UnpackHiMask[] = {2, 2, 3, 3};
9315 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
9316 Mask = UnpackLoMask;
9317 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
9318 Mask = UnpackHiMask;
9319
9320 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
9321 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9322 }
9323
9324 // Try to use shift instructions.
9325 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
9326 Subtarget, DAG))
9327 return Shift;
9328
9329 // There are special ways we can lower some single-element blends.
9330 if (NumV2Elements == 1)
9331 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
9332 Mask, Subtarget, DAG))
9333 return V;
9334
9335 // We have different paths for blend lowering, but they all must use the
9336 // *exact* same predicate.
9337 bool IsBlendSupported = Subtarget.hasSSE41();
9338 if (IsBlendSupported)
9339 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
9340 Subtarget, DAG))
9341 return Blend;
9342
9343 if (SDValue Masked =
9344 lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
9345 return Masked;
9346
9347 // Use dedicated unpack instructions for masks that match their pattern.
9348 if (SDValue V =
9349 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
9350 return V;
9351
9352 // Try to use byte rotation instructions.
9353 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
9354 if (Subtarget.hasSSSE3())
9355 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9356 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
9357 return Rotate;
9358
9359 // If we have direct support for blends, we should lower by decomposing into
9360 // a permute. That will be faster than the domain cross.
9361 if (IsBlendSupported)
9362 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
9363 Mask, DAG);
9364
9365 // Try to lower by permuting the inputs into an unpack instruction.
9366 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1,
9367 V2, Mask, DAG))
9368 return Unpack;
9369
9370 // We implement this with SHUFPS because it can blend from two vectors.
9371 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
9372 // up the inputs, bypassing domain shift penalties that we would incur if we
9373 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
9374 // relevant.
9375 return DAG.getBitcast(
9376 MVT::v4i32,
9377 DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1),
9378 DAG.getBitcast(MVT::v4f32, V2), Mask));
9379 }
9380
9381 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
9382 /// shuffle lowering, and the most complex part.
9383 ///
9384 /// The lowering strategy is to try to form pairs of input lanes which are
9385 /// targeted at the same half of the final vector, and then use a dword shuffle
9386 /// to place them onto the right half, and finally unpack the paired lanes into
9387 /// their final position.
9388 ///
9389 /// The exact breakdown of how to form these dword pairs and align them on the
9390 /// correct sides is really tricky. See the comments within the function for
9391 /// more of the details.
9392 ///
9393 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
9394 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
9395 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
9396 /// vector, form the analogous 128-bit 8-element Mask.
9397 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
9398 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
9399 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9400 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
9401 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
9402
9403 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
9404 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
9405 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
9406
9407 SmallVector<int, 4> LoInputs;
9408 std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
9409 [](int M) { return M >= 0; });
9410 std::sort(LoInputs.begin(), LoInputs.end());
9411 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
9412 SmallVector<int, 4> HiInputs;
9413 std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
9414 [](int M) { return M >= 0; });
9415 std::sort(HiInputs.begin(), HiInputs.end());
9416 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
9417 int NumLToL =
9418 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
9419 int NumHToL = LoInputs.size() - NumLToL;
9420 int NumLToH =
9421 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
9422 int NumHToH = HiInputs.size() - NumLToH;
9423 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
9424 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
9425 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
9426 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
9427
9428 // If we are splatting two values from one half - one to each half, then
9429 // we can shuffle that half so each is splatted to a dword, then splat those
9430 // to their respective halves.
9431 auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
9432 int DOffset) {
9433 int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
9434 int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
9435 V = DAG.getNode(ShufWOp, DL, VT, V,
9436 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
9437 V = DAG.getBitcast(PSHUFDVT, V);
9438 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
9439 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9440 return DAG.getBitcast(VT, V);
9441 };
9442
9443 if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
9444 return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
9445 if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
9446 return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
9447
9448 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
9449 // such inputs we can swap two of the dwords across the half mark and end up
9450 // with <=2 inputs to each half in each half. Once there, we can fall through
9451 // to the generic code below. For example:
9452 //
9453 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
9454 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
9455 //
9456 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
9457 // and an existing 2-into-2 on the other half. In this case we may have to
9458 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
9459 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
9460 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
9461 // because any other situation (including a 3-into-1 or 1-into-3 in the other
9462 // half than the one we target for fixing) will be fixed when we re-enter this
9463 // path. We will also combine away any sequence of PSHUFD instructions that
9464 // result into a single instruction. Here is an example of the tricky case:
9465 //
9466 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
9467 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
9468 //
9469 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
9470 //
9471 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
9472 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
9473 //
9474 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
9475 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
9476 //
9477 // The result is fine to be handled by the generic logic.
9478 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
9479 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
9480 int AOffset, int BOffset) {
9481 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
9482 "Must call this with A having 3 or 1 inputs from the A half.");
9483 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
9484 "Must call this with B having 1 or 3 inputs from the B half.");
9485 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
9486 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
9487
9488 bool ThreeAInputs = AToAInputs.size() == 3;
9489
9490 // Compute the index of dword with only one word among the three inputs in
9491 // a half by taking the sum of the half with three inputs and subtracting
9492 // the sum of the actual three inputs. The difference is the remaining
9493 // slot.
9494 int ADWord, BDWord;
9495 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
9496 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
9497 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
9498 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
9499 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
9500 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
9501 int TripleNonInputIdx =
9502 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
9503 TripleDWord = TripleNonInputIdx / 2;
9504
9505 // We use xor with one to compute the adjacent DWord to whichever one the
9506 // OneInput is in.
9507 OneInputDWord = (OneInput / 2) ^ 1;
9508
9509 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
9510 // and BToA inputs. If there is also such a problem with the BToB and AToB
9511 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
9512 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
9513 // is essential that we don't *create* a 3<-1 as then we might oscillate.
9514 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
9515 // Compute how many inputs will be flipped by swapping these DWords. We
9516 // need to balance this to ensure we don't form a 3-1 shuffle in the
9517 // other half.
9519 int NumFlippedAToBInputs =
9520 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
9521 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
9522 int NumFlippedBToBInputs =
9523 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
9524 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
9525 if ((NumFlippedAToBInputs == 1 &&
9526 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
9527 (NumFlippedBToBInputs == 1 &&
9528 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
9529 // We choose whether to fix the A half or B half based on whether that
9530 // half has zero flipped inputs. At zero, we may not be able to fix it
9531 // with that half. We also bias towards fixing the B half because that
9532 // will more commonly be the high half, and we have to bias one way.
9533 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
9534 ArrayRef<int> Inputs) {
9535 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
9536 bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
9537 PinnedIdx ^ 1) != Inputs.end();
9538 // Determine whether the free index is in the flipped dword or the
9539 // unflipped dword based on where the pinned index is. We use this bit
9540 // in an xor to conditionally select the adjacent dword.
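// For example, with DWord == 1 and PinnedIdx == 2, the pinned index lies in
// that dword, so the xor flips to the adjacent dword and FixFreeIdx starts
// at 0; with PinnedIdx == 5 it starts at 2 instead. (Illustrative values.)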
9541 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
9542 bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
9543 FixFreeIdx) != Inputs.end();
9544 if (IsFixIdxInput == IsFixFreeIdxInput)
9545 FixFreeIdx += 1;
9546 IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
9547 FixFreeIdx) != Inputs.end();
9548 assert(IsFixIdxInput != IsFixFreeIdxInput &&
9549 "We need to be changing the number of flipped inputs!");
9550 int PSHUFHalfMask[] = {0, 1, 2, 3};
9551 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
9552 V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
9553 MVT::v8i16, V,
9554 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
9555
9556 for (int &M : Mask)
9557 if (M >= 0 && M == FixIdx)
9558 M = FixFreeIdx;
9559 else if (M >= 0 && M == FixFreeIdx)
9560 M = FixIdx;
9561 };
9562 if (NumFlippedBToBInputs != 0) {
9563 int BPinnedIdx =
9564 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9565 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
9566 } else {
9567 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
9568 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
9569 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
9570 }
9571 }
9572 }
9573
9574 int PSHUFDMask[] = {0, 1, 2, 3};
9575 PSHUFDMask[ADWord] = BDWord;
9576 PSHUFDMask[BDWord] = ADWord;
9577 V = DAG.getBitcast(
9578 VT,
9579 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
9580 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9581
9582 // Adjust the mask to match the new locations of A and B.
9583 for (int &M : Mask)
9584 if (M >= 0 && M/2 == ADWord)
9585 M = 2 * BDWord + M % 2;
9586 else if (M >= 0 && M/2 == BDWord)
9587 M = 2 * ADWord + M % 2;
9588
9589 // Recurse back into this routine to re-compute state now that this isn't
9590 // a 3 and 1 problem.
9591 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
9592 DAG);
9593 };
9594 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
9595 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
9596 else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
9597 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
9598
9599 // At this point there are at most two inputs to the low and high halves from
9600 // each half. That means the inputs can always be grouped into dwords and
9601 // those dwords can then be moved to the correct half with a dword shuffle.
9602 // We use at most one low and one high word shuffle to collect these paired
9603 // inputs into dwords, and finally a dword shuffle to place them.
9604 int PSHUFLMask[4] = {-1, -1, -1, -1};
9605 int PSHUFHMask[4] = {-1, -1, -1, -1};
9606 int PSHUFDMask[4] = {-1, -1, -1, -1};
9607
9608 // First fix the masks for all the inputs that are staying in their
9609 // original halves. This will then dictate the targets of the cross-half
9610 // shuffles.
9611 auto fixInPlaceInputs =
9612 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
9613 MutableArrayRef<int> SourceHalfMask,
9614 MutableArrayRef<int> HalfMask, int HalfOffset) {
9615 if (InPlaceInputs.empty())
9616 return;
9617 if (InPlaceInputs.size() == 1) {
9618 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9619 InPlaceInputs[0] - HalfOffset;
9620 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
9621 return;
9622 }
9623 if (IncomingInputs.empty()) {
9624 // Just fix all of the in place inputs.
9625 for (int Input : InPlaceInputs) {
9626 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
9627 PSHUFDMask[Input / 2] = Input / 2;
9628 }
9629 return;
9630 }
9631
9632 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
9633 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9634 InPlaceInputs[0] - HalfOffset;
9635 // Put the second input next to the first so that they are packed into
9636 // a dword. We find the adjacent index by toggling the low bit.
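// For example, if InPlaceInputs[0] is word 2, the adjacent slot is
// 2 ^ 1 = 3, so both inputs end up packed into dword 1.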
9637 int AdjIndex = InPlaceInputs[0] ^ 1;
9638 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
9639 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
9640 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
9641 };
9642 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
9643 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
9644
9645 // Now gather the cross-half inputs and place them into a free dword of
9646 // their target half.
9647 // FIXME: This operation could almost certainly be simplified dramatically to
9648 // look more like the 3-1 fixing operation.
9649 auto moveInputsToRightHalf = [&PSHUFDMask](
9650 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
9651 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
9652 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
9653 int DestOffset) {
9654 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
9655 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
9656 };
9657 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
9658 int Word) {
9659 int LowWord = Word & ~1;
9660 int HighWord = Word | 1;
9661 return isWordClobbered(SourceHalfMask, LowWord) ||
9662 isWordClobbered(SourceHalfMask, HighWord);
9663 };
9664
9665 if (IncomingInputs.empty())
9666 return;
9667
9668 if (ExistingInputs.empty()) {
9669 // Map any dwords with inputs from them into the right half.
9670 for (int Input : IncomingInputs) {
9671 // If the source half mask maps over the inputs, turn those into
9672 // swaps and use the swapped lane.
9673 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
9674 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
9675 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
9676 Input - SourceOffset;
9677 // We have to swap the uses in our half mask in one sweep.
9678 for (int &M : HalfMask)
9679 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
9680 M = Input;
9681 else if (M == Input)
9682 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9683 } else {
9684 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
9685 Input - SourceOffset &&
9686 "Previous placement doesn't match!");
9687 }
9688 // Note that this correctly re-maps both when we do a swap and when
9689 // we observe the other side of the swap above. We rely on that to
9690 // avoid swapping the members of the input list directly.
9691 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9692 }
9693
9694 // Map the input's dword into the correct half.
9695 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
9696 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
9697 else
9698 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
9699 Input / 2 &&
9700 "Previous placement doesn't match!");
9701 }
9702
9703 // And just directly shift any other-half mask elements to be same-half
9704 // as we will have mirrored the dword containing the element into the
9705 // same position within that half.
9706 for (int &M : HalfMask)
9707 if (M >= SourceOffset && M < SourceOffset + 4) {
9708 M = M - SourceOffset + DestOffset;
9709 assert(M >= 0 && "This should never wrap below zero!");
9710 }
9711 return;
9712 }
9713
9714 // Ensure we have the input in a viable dword of its current half. This
9715 // is particularly tricky because the original position may be clobbered
9716 // by inputs being moved and *staying* in that half.
9717 if (IncomingInputs.size() == 1) {
9718 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9719 int InputFixed = std::find(std::begin(SourceHalfMask),
9720 std::end(SourceHalfMask), -1) -
9721 std::begin(SourceHalfMask) + SourceOffset;
9722 SourceHalfMask[InputFixed - SourceOffset] =
9723 IncomingInputs[0] - SourceOffset;
9724 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
9725 InputFixed);
9726 IncomingInputs[0] = InputFixed;
9727 }
9728 } else if (IncomingInputs.size() == 2) {
9729 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
9730 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9731 // We have two non-adjacent or clobbered inputs we need to extract from
9732 // the source half. To do this, we need to map them into some adjacent
9733 // dword slot in the source mask.
9734 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
9735 IncomingInputs[1] - SourceOffset};
9736
9737 // If there is a free slot in the source half mask adjacent to one of
9738 // the inputs, place the other input in it. We use (Index XOR 1) to
9739 // compute an adjacent index.
9740 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9741 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
9742 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9743 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9744 InputsFixed[1] = InputsFixed[0] ^ 1;
9745 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9746 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
9747 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9748 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9749 InputsFixed[0] = InputsFixed[1] ^ 1;
9750 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
9751 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
9752 // The two inputs are in the same DWord but it is clobbered and the
9753 // adjacent DWord isn't used at all. Move both inputs to the free
9754 // slot.
9755 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9756 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9757 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
9758 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9759 } else {
9760 // The only way we hit this point is if there is no clobbering
9761 // (because there are no off-half inputs to this half) and there is no
9762 // free slot adjacent to one of the inputs. In this case, we have to
9763 // swap an input with a non-input.
9764 for (int i = 0; i < 4; ++i)
9765 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
9766 "We can't handle any clobbers here!");
9767 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9768 "Cannot have adjacent inputs here!");
9769
9770 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9771 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9772
9773 // We also have to update the final source mask in this case because
9774 // it may need to undo the above swap.
9775 for (int &M : FinalSourceHalfMask)
9776 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9777 M = InputsFixed[1] + SourceOffset;
9778 else if (M == InputsFixed[1] + SourceOffset)
9779 M = (InputsFixed[0] ^ 1) + SourceOffset;
9780
9781 InputsFixed[1] = InputsFixed[0] ^ 1;
9782 }
9783
9784 // Point everything at the fixed inputs.
9785 for (int &M : HalfMask)
9786 if (M == IncomingInputs[0])
9787 M = InputsFixed[0] + SourceOffset;
9788 else if (M == IncomingInputs[1])
9789 M = InputsFixed[1] + SourceOffset;
9790
9791 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9792 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9793 }
9794 } else {
9795 llvm_unreachable("Unhandled input size!");
9796 }
9797
9798 // Now hoist the DWord down to the right half.
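// Prefer the low dword of the destination half if it is still free;
// otherwise fall back to its high dword, which is asserted free below.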
9799 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
9800 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
9801 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9802 for (int &M : HalfMask)
9803 for (int Input : IncomingInputs)
9804 if (M == Input)
9805 M = FreeDWord * 2 + Input % 2;
9806 };
9807 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9808 /*SourceOffset*/ 4, /*DestOffset*/ 0);
9809 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9810 /*SourceOffset*/ 0, /*DestOffset*/ 4);
9811
9812 // Now enact all the shuffles we've computed to move the inputs into their
9813 // target half.
9814 if (!isNoopShuffleMask(PSHUFLMask))
9815 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
9816 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
9817 if (!isNoopShuffleMask(PSHUFHMask))
9818 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
9819 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
9820 if (!isNoopShuffleMask(PSHUFDMask))
9821 V = DAG.getBitcast(
9822 VT,
9823 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
9824 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9825
9826 // At this point, each half should contain all its inputs, and we can then
9827 // just shuffle them into their final position.
9828 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
9829 "Failed to lift all the high half inputs to the low mask!");
9830 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
9831 "Failed to lift all the low half inputs to the high mask!");
9832
9833 // Do a half shuffle for the low mask.
9834 if (!isNoopShuffleMask(LoMask))
9835 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
9836 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
9837
9838 // Do a half shuffle with the high mask after shifting its values down.
9839 for (int &M : HiMask)
9840 if (M >= 0)
9841 M -= 4;
9842 if (!isNoopShuffleMask(HiMask))
9843 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
9844 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
9845
9846 return V;
9847 }
9848
9849 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
9850 /// blend if only one input is used.
9851 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
9852 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9853 SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
9854 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9855 SDValue V1Mask[16];
9856 SDValue V2Mask[16];
9857 V1InUse = false;
9858 V2InUse = false;
9859
9860 int Size = Mask.size();
9861 int Scale = 16 / Size;
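// Scale maps mask elements onto bytes; e.g. for a v8i16 shuffle Size is 8,
// so each mask element expands into Scale == 2 adjacent byte selectors.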
9862 for (int i = 0; i < 16; ++i) {
9863 if (Mask[i / Scale] < 0) {
9864 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9865 } else {
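// PSHUFB zeroes a destination byte whenever the high bit of its control
// byte is set, so 0x80 serves as a "select zero" index.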
9866 const int ZeroMask = 0x80;
9867 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
9868 : ZeroMask;
9869 int V2Idx = Mask[i / Scale] < Size
9870 ? ZeroMask
9871 : (Mask[i / Scale] - Size) * Scale + i % Scale;
9872 if (Zeroable[i / Scale])
9873 V1Idx = V2Idx = ZeroMask;
9874 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
9875 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
9876 V1InUse |= (ZeroMask != V1Idx);
9877 V2InUse |= (ZeroMask != V2Idx);
9878 }
9879 }
9880
9881 if (V1InUse)
9882 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
9883 DAG.getBitcast(MVT::v16i8, V1),
9884 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
9885 if (V2InUse)
9886 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
9887 DAG.getBitcast(MVT::v16i8, V2),
9888 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
9889
9890 // If we need shuffled inputs from both, blend the two.
9891 SDValue V;
9892 if (V1InUse && V2InUse)
9893 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9894 else
9895 V = V1InUse ? V1 : V2;
9896
9897 // Cast the result back to the correct type.
9898 return DAG.getBitcast(VT, V);
9899 }
9900
9901 /// \brief Generic lowering of 8-lane i16 shuffles.
9902 ///
9903 /// This handles both single-input shuffles and combined shuffle/blends with
9904 /// two inputs. The single input shuffles are immediately delegated to
9905 /// a dedicated lowering routine.
9906 ///
9907 /// The blends are lowered in one of three fundamental ways. If there are few
9908 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9909 /// of the input is significantly cheaper when lowered as an interleaving of
9910 /// the two inputs, try to interleave them. Otherwise, blend the low and high
9911 /// halves of the inputs separately (making them have relatively few inputs)
9912 /// and then concatenate them.
9913 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9914 SDValue V1, SDValue V2,
9915 const X86Subtarget &Subtarget,
9916 SelectionDAG &DAG) {
9917 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9918 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9919 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9920
9921 // Whenever we can lower this as a zext, that instruction is strictly faster
9922 // than any alternative.
9923 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9924 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9925 return ZExt;
9926
9927 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
9928
9929 if (NumV2Inputs == 0) {
9930 // Check for being able to broadcast a single element.
9931 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9932 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9933 return Broadcast;
9934
9935 // Try to use shift instructions.
9936 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
9937 Subtarget, DAG))
9938 return Shift;
9939
9940 // Use dedicated unpack instructions for masks that match their pattern.
9941 if (SDValue V =
9942 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
9943 return V;
9944
9945 // Try to use byte rotation instructions.
9946 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
9947 Mask, Subtarget, DAG))
9948 return Rotate;
9949
9950 // Make a copy of the mask so it can be modified.
9951 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
9952 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
9953 MutableMask, Subtarget,
9954 DAG);
9955 }
9956
9957 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
9958 "All single-input shuffles should be canonicalized to be V1-input "
9959 "shuffles.");
9960
9961 // Try to use shift instructions.
9962 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
9963 Subtarget, DAG))
9964 return Shift;
9965
9966 // See if we can use SSE4A Extraction / Insertion.
9967 if (Subtarget.hasSSE4A())
9968 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
9969 return V;
9970
9971 // There are special ways we can lower some single-element blends.
9972 if (NumV2Inputs == 1)
9973 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
9974 Mask, Subtarget, DAG))
9975 return V;
9976
9977 // We have different paths for blend lowering, but they all must use the
9978 // *exact* same predicate.
9979 bool IsBlendSupported = Subtarget.hasSSE41();
9980 if (IsBlendSupported)
9981 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9982 Subtarget, DAG))
9983 return Blend;
9984
9985 if (SDValue Masked =
9986 lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
9987 return Masked;
9988
9989 // Use dedicated unpack instructions for masks that match their pattern.
9990 if (SDValue V =
9991 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
9992 return V;
9993
9994 // Try to use byte rotation instructions.
9995 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9996 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9997 return Rotate;
9998
9999 if (SDValue BitBlend =
10000 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
10001 return BitBlend;
10002
10003 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
10004 V2, Mask, DAG))
10005 return Unpack;
10006
10007 // If we can't directly blend but can use PSHUFB, that will be better as it
10008 // can both shuffle and set up the inefficient blend.
10009 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
10010 bool V1InUse, V2InUse;
10011 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, DAG,
10012 V1InUse, V2InUse);
10013 }
10014
10015 // We can always bit-blend if we have to so the fallback strategy is to
10016 // decompose into single-input permutes and blends.
10017 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
10018 Mask, DAG);
10019 }
10020
10021 /// \brief Check whether a compaction lowering can be done by dropping even
10022 /// elements and compute how many times even elements must be dropped.
10023 ///
10024 /// This handles shuffles which take every Nth element where N is a power of
10025 /// two. Example shuffle masks:
10026 ///
10027 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10028 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10029 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10030 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10031 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10032 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10033 ///
10034 /// Any of these lanes can of course be undef.
10035 ///
10036 /// This routine only supports N <= 3.
10037 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10038 /// for larger N.
10039 ///
10040 /// \returns N above, or the number of times even elements must be dropped if
10041 /// there is such a number. Otherwise returns zero.
10042 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
10043 bool IsSingleInput) {
10044 // The modulus for the shuffle vector entries is based on whether this is
10045 // a single input or not.
10046 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10047 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10048 "We should only be called with masks with a power-of-2 size!");
10049
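// Because ShuffleModulus is a power of two, masking with ShuffleModulus - 1
// below is equivalent to reducing modulo ShuffleModulus.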
10050 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10051
10052 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10053 // and 2^3 simultaneously. This is because we may have ambiguity with
10054 // partially undef inputs.
10055 bool ViableForN[3] = {true, true, true};
10056
10057 for (int i = 0, e = Mask.size(); i < e; ++i) {
10058 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10059 // want.
10060 if (Mask[i] < 0)
10061 continue;
10062
10063 bool IsAnyViable = false;
10064 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
10065 if (ViableForN[j]) {
10066 uint64_t N = j + 1;
10067
10068 // The shuffle mask must be equal to (i * 2^N) % M.
10069 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
10070 IsAnyViable = true;
10071 else
10072 ViableForN[j] = false;
10073 }
10074 // Early exit if we exhaust the possible powers of two.
10075 if (!IsAnyViable)
10076 break;
10077 }
10078
10079 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
10080 if (ViableForN[j])
10081 return j + 1;
10082
10083 // Return 0 as there is no viable power of two.
10084 return 0;
10085 }
10086
10087 /// \brief Generic lowering of v16i8 shuffles.
10088 ///
10089 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
10090 /// detect any complexity reducing interleaving. If that doesn't help, it uses
10091 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
10092 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
10093 /// back together.
10094 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10095 SDValue V1, SDValue V2,
10096 const X86Subtarget &Subtarget,
10097 SelectionDAG &DAG) {
10098 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
10099 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
10100 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10101
10102 // Try to use shift instructions.
10103 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
10104 Subtarget, DAG))
10105 return Shift;
10106
10107 // Try to use byte rotation instructions.
10108 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10109 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10110 return Rotate;
10111
10112 // Try to use a zext lowering.
10113 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10114 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10115 return ZExt;
10116
10117 // See if we can use SSE4A Extraction / Insertion.
10118 if (Subtarget.hasSSE4A())
10119 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
10120 return V;
10121
10122 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
10123
10124 // For single-input shuffles, there are some nicer lowering tricks we can use.
10125 if (NumV2Elements == 0) {
10126 // Check for being able to broadcast a single element.
10127 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10128 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10129 return Broadcast;
10130
10131 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
10132 // Notably, this handles splat and partial-splat shuffles more efficiently.
10133 // However, it only makes sense if the pre-duplication shuffle simplifies
10134 // things significantly. Currently, this means we need to be able to
10135 // express the pre-duplication shuffle as an i16 shuffle.
10136 //
10137 // FIXME: We should check for other patterns which can be widened into an
10138 // i16 shuffle as well.
10139 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
10140 for (int i = 0; i < 16; i += 2)
10141 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
10142 return false;
10143
10144 return true;
10145 };
10146 auto tryToWidenViaDuplication = [&]() -> SDValue {
10147 if (!canWidenViaDuplication(Mask))
10148 return SDValue();
10149 SmallVector<int, 4> LoInputs;
10150 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
10151 [](int M) { return M >= 0 && M < 8; });
10152 std::sort(LoInputs.begin(), LoInputs.end());
10153 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
10154 LoInputs.end());
10155 SmallVector<int, 4> HiInputs;
10156 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
10157 [](int M) { return M >= 8; });
10158 std::sort(HiInputs.begin(), HiInputs.end());
10159 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
10160 HiInputs.end());
10161
10162 bool TargetLo = LoInputs.size() >= HiInputs.size();
10163 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
10164 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
10165
10166 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
10167 SmallDenseMap<int, int, 8> LaneMap;
10168 for (int I : InPlaceInputs) {
10169 PreDupI16Shuffle[I/2] = I/2;
10170 LaneMap[I] = I;
10171 }
10172 int j = TargetLo ? 0 : 4, je = j + 4;
10173 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
10174 // Check if j is already a shuffle of this input. This happens when
10175 // there are two adjacent bytes after we move the low one.
10176 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
10177 // If we haven't yet mapped the input, search for a slot into which
10178 // we can map it.
10179 while (j < je && PreDupI16Shuffle[j] >= 0)
10180 ++j;
10181
10182 if (j == je)
10183 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
10184 return SDValue();
10185
10186 // Map this input with the i16 shuffle.
10187 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
10188 }
10189
10190 // Update the lane map based on the mapping we ended up with.
10191 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
10192 }
10193 V1 = DAG.getBitcast(
10194 MVT::v16i8,
10195 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
10196 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
10197
10198 // Unpack the bytes to form the i16s that will be shuffled into place.
10199 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10200 MVT::v16i8, V1, V1);
10201
10202 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10203 for (int i = 0; i < 16; ++i)
10204 if (Mask[i] >= 0) {
10205 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
10206 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
10207 if (PostDupI16Shuffle[i / 2] < 0)
10208 PostDupI16Shuffle[i / 2] = MappedMask;
10209 else
10210 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
10211 "Conflicting entries in the original shuffle!");
10212 }
10213 return DAG.getBitcast(
10214 MVT::v16i8,
10215 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
10216 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
10217 };
10218 if (SDValue V = tryToWidenViaDuplication())
10219 return V;
10220 }
10221
10222 if (SDValue Masked =
10223 lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG))
10224 return Masked;
10225
10226 // Use dedicated unpack instructions for masks that match their pattern.
10227 if (SDValue V =
10228 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
10229 return V;
10230
10231 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
10232 // with PSHUFB. It is important to do this before we attempt to generate any
10233 // blends but after all of the single-input lowerings. If the single input
10234 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
10235 // want to preserve that and we can DAG combine any longer sequences into
10236 // a PSHUFB in the end. But once we start blending from multiple inputs,
10237 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
10238 // and there are *very* few patterns that would actually be faster than the
10239 // PSHUFB approach because of its ability to zero lanes.
10240 //
10241 // FIXME: The only exceptions to the above are blends which are exact
10242 // interleavings with direct instructions supporting them. We currently don't
10243 // handle those well here.
10244 if (Subtarget.hasSSSE3()) {
10245 bool V1InUse = false;
10246 bool V2InUse = false;
10247
10248 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
10249 DL, MVT::v16i8, V1, V2, Mask, DAG, V1InUse, V2InUse);
10250
10251 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
10252 // do so. This avoids using them to handle blends-with-zero which is
10253 // important as a single pshufb is significantly faster for that.
10254 if (V1InUse && V2InUse) {
10255 if (Subtarget.hasSSE41())
10256 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
10257 Mask, Subtarget, DAG))
10258 return Blend;
10259
10260 // We can use an unpack to do the blending rather than an or in some
10261 // cases. Even though the or may be (very slightly) more efficient, we
10262 // prefer this lowering because there are common cases where part of
10263 // the complexity of the shuffles goes away when we do the final blend as
10264 // an unpack.
10265 // FIXME: It might be worth trying to detect if the unpack-feeding
10266 // shuffles will both be pshufb, in which case we shouldn't bother with
10267 // this.
10268 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10269 DL, MVT::v16i8, V1, V2, Mask, DAG))
10270 return Unpack;
10271 }
10272
10273 return PSHUFB;
10274 }
10275
10276 // There are special ways we can lower some single-element blends.
10277 if (NumV2Elements == 1)
10278 if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
10279 Mask, Subtarget, DAG))
10280 return V;
10281
10282 if (SDValue BitBlend =
10283 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
10284 return BitBlend;
10285
10286 // Check whether a compaction lowering can be done. This handles shuffles
10287 // which take every Nth element for some even N. See the helper function for
10288 // details.
10289 //
10290 // We special case these as they can be particularly efficiently handled with
10291 // the PACKUSWB instruction on x86 and they show up in common patterns of
10292 // rearranging bytes to truncate wide elements.
10293 bool IsSingleInput = V2.isUndef();
10294 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
10295 // NumEvenDrops is the power of two stride of the elements. Another way of
10296 // thinking about it is that we need to drop the even elements this many
10297 // times to get the original input.
10298
10299 // First we need to zero all the dropped bytes.
10300 assert(NumEvenDrops <= 3 &&
10301 "No support for dropping even elements more than 3 times.");
10302 // We use the mask type to pick which bytes are preserved based on how many
10303 // elements are dropped.
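// Splatting 0xFF at the wider element type and bitcasting to v16i8 keeps
// only the lowest byte of each 2-, 4-, or 8-byte group; e.g. for
// NumEvenDrops == 1 the v8i16 splat of 0x00FF becomes the byte pattern
// ff,00,ff,00,... which clears every odd byte.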
10304 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
10305 SDValue ByteClearMask = DAG.getBitcast(
10306 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
10307 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
10308 if (!IsSingleInput)
10309 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
10310
10311 // Now pack things back together.
10312 V1 = DAG.getBitcast(MVT::v8i16, V1);
10313 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
10314 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
10315 for (int i = 1; i < NumEvenDrops; ++i) {
10316 Result = DAG.getBitcast(MVT::v8i16, Result);
10317 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
10318 }
10319
10320 return Result;
10321 }
10322
10323 // Handle multi-input cases by blending single-input shuffles.
10324 if (NumV2Elements > 0)
10325 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
10326 Mask, DAG);
10327
10328 // The fallback path for single-input shuffles widens this into two v8i16
10329 // vectors with unpacks, shuffles those, and then pulls them back together
10330 // with a pack.
10331 SDValue V = V1;
10332
10333 int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10334 int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10335 for (int i = 0; i < 16; ++i)
10336 if (Mask[i] >= 0)
10337 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
10338
10339 SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
10340
10341 SDValue VLoHalf, VHiHalf;
10342 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
10343 // them out and avoid using UNPCK{L,H} to extract the elements of V as
10344 // i16s.
10345 if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
10346 [](int M) { return M >= 0 && M % 2 == 1; }) &&
10347 std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
10348 [](int M) { return M >= 0 && M % 2 == 1; })) {
10349 // Use a mask to drop the high bytes.
10350 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
10351 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
10352 DAG.getConstant(0x00FF, DL, MVT::v8i16));
10353
10354 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
10355 VHiHalf = DAG.getUNDEF(MVT::v8i16);
10356
10357 // Squash the masks to point directly into VLoHalf.
10358 for (int &M : LoBlendMask)
10359 if (M >= 0)
10360 M /= 2;
10361 for (int &M : HiBlendMask)
10362 if (M >= 0)
10363 M /= 2;
10364 } else {
10365 // Otherwise just unpack the low half of V into VLoHalf and the high half into
10366 // VHiHalf so that we can blend them as i16s.
10367 VLoHalf = DAG.getBitcast(
10368 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
10369 VHiHalf = DAG.getBitcast(
10370 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
10371 }
10372
10373 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
10374 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
10375
10376 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
10377 }
10378
10379 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
10380 ///
10381 /// This routine breaks down the specific type of 128-bit shuffle and
10382 /// dispatches to the lowering routines accordingly.
10383 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10384 MVT VT, SDValue V1, SDValue V2,
10385 const X86Subtarget &Subtarget,
10386 SelectionDAG &DAG) {
10387 switch (VT.SimpleTy) {
10388 case MVT::v2i64:
10389 return lowerV2I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10390 case MVT::v2f64:
10391 return lowerV2F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10392 case MVT::v4i32:
10393 return lowerV4I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10394 case MVT::v4f32:
10395 return lowerV4F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10396 case MVT::v8i16:
10397 return lowerV8I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10398 case MVT::v16i8:
10399 return lowerV16I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10400
10401 default:
10402 llvm_unreachable("Unimplemented!");
10403 }
10404 }
10405
10406 /// \brief Helper function to test whether a shuffle mask could be
10407 /// simplified by widening the elements being shuffled.
10408 ///
10409 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
10410 /// leaves it in an unspecified state.
10411 ///
10412 /// NOTE: This must handle normal vector shuffle masks and *target* vector
10413 /// shuffle masks. The latter have the special property of a '-2' representing
10414 /// a zeroed lane of a vector.
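/// For example, the mask <0, 1, 6, 7> widens to <0, 3>, while <0, 2, 4, 6>
/// cannot be widened because none of its pairs is of the form (2k, 2k+1).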
10415 static bool canWidenShuffleElements(ArrayRef<int> Mask,
10416 SmallVectorImpl<int> &WidenedMask) {
10417 WidenedMask.assign(Mask.size() / 2, 0);
10418 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
10419 // If both elements are undef, it's trivial.
10420 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
10421 WidenedMask[i/2] = SM_SentinelUndef;
10422 continue;
10423 }
10424
10425 // Check for an undef mask and a mask value properly aligned to fit with
10426 // a pair of values. If we find such a case, use the non-undef mask's value.
10427 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
10428 WidenedMask[i/2] = Mask[i + 1] / 2;
10429 continue;
10430 }
10431 if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
10432 WidenedMask[i/2] = Mask[i] / 2;
10433 continue;
10434 }
10435
10436 // When zeroing, we need to spread the zeroing across both lanes to widen.
10437 if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
10438 if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
10439 (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
10440 WidenedMask[i/2] = SM_SentinelZero;
10441 continue;
10442 }
10443 return false;
10444 }
10445
10446 // Finally check if the two mask values are adjacent and aligned with
10447 // a pair.
10448 if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
10449 WidenedMask[i/2] = Mask[i] / 2;
10450 continue;
10451 }
10452
10453 // Otherwise we can't safely widen the elements used in this shuffle.
10454 return false;
10455 }
10456 assert(WidenedMask.size() == Mask.size() / 2 &&
10457 "Incorrect size of mask after widening the elements!");
10458
10459 return true;
10460 }
10461
10462 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
10463 ///
10464 /// This routine just extracts two subvectors, shuffles them independently, and
10465 /// then concatenates them back together. This should work effectively with all
10466 /// AVX vector shuffle types.
10467 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
10468 SDValue V2, ArrayRef<int> Mask,
10469 SelectionDAG &DAG) {
10470 assert(VT.getSizeInBits() >= 256 &&
10471 "Only for 256-bit or wider vector shuffles!");
10472 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
10473 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
10474
10475 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
10476 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
10477
10478 int NumElements = VT.getVectorNumElements();
10479 int SplitNumElements = NumElements / 2;
10480 MVT ScalarVT = VT.getVectorElementType();
10481 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
10482
10483 // Rather than splitting build-vectors, just build two narrower build
10484 // vectors. This helps shuffling with splats and zeros.
10485 auto SplitVector = [&](SDValue V) {
10486 V = peekThroughBitcasts(V);
10487
10488 MVT OrigVT = V.getSimpleValueType();
10489 int OrigNumElements = OrigVT.getVectorNumElements();
10490 int OrigSplitNumElements = OrigNumElements / 2;
10491 MVT OrigScalarVT = OrigVT.getVectorElementType();
10492 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
10493
10494 SDValue LoV, HiV;
10495
10496 auto *BV = dyn_cast<BuildVectorSDNode>(V);
10497 if (!BV) {
10498 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
10499 DAG.getIntPtrConstant(0, DL));
10500 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
10501 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
10502 } else {
10503
10504 SmallVector<SDValue, 16> LoOps, HiOps;
10505 for (int i = 0; i < OrigSplitNumElements; ++i) {
10506 LoOps.push_back(BV->getOperand(i));
10507 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
10508 }
10509 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
10510 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
10511 }
10512 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
10513 DAG.getBitcast(SplitVT, HiV));
10514 };
10515
10516 SDValue LoV1, HiV1, LoV2, HiV2;
10517 std::tie(LoV1, HiV1) = SplitVector(V1);
10518 std::tie(LoV2, HiV2) = SplitVector(V2);
10519
10520 // Now create two 4-way blends of these half-width vectors.
10521 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
10522 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
10523 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
10524 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
10525 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
10526 for (int i = 0; i < SplitNumElements; ++i) {
10527 int M = HalfMask[i];
10528 if (M >= NumElements) {
10529 if (M >= NumElements + SplitNumElements)
10530 UseHiV2 = true;
10531 else
10532 UseLoV2 = true;
10533 V2BlendMask[i] = M - NumElements;
10534 BlendMask[i] = SplitNumElements + i;
10535 } else if (M >= 0) {
10536 if (M >= SplitNumElements)
10537 UseHiV1 = true;
10538 else
10539 UseLoV1 = true;
10540 V1BlendMask[i] = M;
10541 BlendMask[i] = i;
10542 }
10543 }
10544
10545 // Because the lowering happens after all combining takes place, we need to
10546 // manually combine these blend masks as much as possible so that we create
10547 // a minimal number of high-level vector shuffle nodes.
10548
10549 // First try just blending the halves of V1 or V2.
10550 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
10551 return DAG.getUNDEF(SplitVT);
10552 if (!UseLoV2 && !UseHiV2)
10553 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10554 if (!UseLoV1 && !UseHiV1)
10555 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10556
10557 SDValue V1Blend, V2Blend;
10558 if (UseLoV1 && UseHiV1) {
10559 V1Blend =
10560 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10561 } else {
10562 // We only use half of V1 so map the usage down into the final blend mask.
10563 V1Blend = UseLoV1 ? LoV1 : HiV1;
10564 for (int i = 0; i < SplitNumElements; ++i)
10565 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
10566 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
10567 }
10568 if (UseLoV2 && UseHiV2) {
10569 V2Blend =
10570 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10571 } else {
10572 // We only use half of V2 so map the usage down into the final blend mask.
10573 V2Blend = UseLoV2 ? LoV2 : HiV2;
10574 for (int i = 0; i < SplitNumElements; ++i)
10575 if (BlendMask[i] >= SplitNumElements)
10576 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
10577 }
10578 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
10579 };
10580 SDValue Lo = HalfBlend(LoMask);
10581 SDValue Hi = HalfBlend(HiMask);
10582 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
10583 }
10584
10585 /// \brief Either split a vector in halves or decompose the shuffles and the
10586 /// blend.
10587 ///
10588 /// This is provided as a good fallback for many lowerings of non-single-input
10589 /// shuffles with more than one 128-bit lane. In those cases, we want to select
10590 /// between splitting the shuffle into 128-bit components and stitching those
10591 /// back together vs. extracting the single-input shuffles and blending those
10592 /// results.
10593 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
10594 SDValue V1, SDValue V2,
10595 ArrayRef<int> Mask,
10596 SelectionDAG &DAG) {
10597 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
10598 "shuffles as it could then recurse on itself.");
10599 int Size = Mask.size();
10600
10601 // If this can be modeled as a broadcast of two elements followed by a blend,
10602 // prefer that lowering. This is especially important because broadcasts can
10603 // often fold with memory operands.
10604 auto DoBothBroadcast = [&] {
10605 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
10606 for (int M : Mask)
10607 if (M >= Size) {
10608 if (V2BroadcastIdx < 0)
10609 V2BroadcastIdx = M - Size;
10610 else if (M - Size != V2BroadcastIdx)
10611 return false;
10612 } else if (M >= 0) {
10613 if (V1BroadcastIdx < 0)
10614 V1BroadcastIdx = M;
10615 else if (M != V1BroadcastIdx)
10616 return false;
10617 }
10618 return true;
10619 };
10620 if (DoBothBroadcast())
10621 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10622 DAG);
10623
10624 // If the inputs all stem from a single 128-bit lane of each input, then we
10625 // split them rather than blending because the split will decompose to
10626 // unusually few instructions.
10627 int LaneCount = VT.getSizeInBits() / 128;
10628 int LaneSize = Size / LaneCount;
10629 SmallBitVector LaneInputs[2];
10630 LaneInputs[0].resize(LaneCount, false);
10631 LaneInputs[1].resize(LaneCount, false);
10632 for (int i = 0; i < Size; ++i)
10633 if (Mask[i] >= 0)
10634 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10635 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10636 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10637
10638 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10639 // that the decomposed single-input shuffles don't end up here.
10640 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10641 }
10642
10643 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10644 /// a permutation and blend of those lanes.
10645 ///
10646 /// This essentially blends the out-of-lane inputs to each lane into the lane
10647 /// from a permuted copy of the vector. This lowering strategy results in four
10648 /// instructions in the worst case for a single-input cross lane shuffle which
10649 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
10650 /// of. Special cases for each particular shuffle pattern should be handled
10651 /// prior to trying this lowering.
10652 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
10653 SDValue V1, SDValue V2,
10654 ArrayRef<int> Mask,
10655 SelectionDAG &DAG) {
10656 // FIXME: This should probably be generalized for 512-bit vectors as well.
10657 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
10658 int Size = Mask.size();
10659 int LaneSize = Size / 2;
10660
10661 // If there are only inputs from one 128-bit lane, splitting will in fact be
10662 // less expensive. The flags track whether the given lane contains an element
10663 // that crosses to another lane.
10664 bool LaneCrossing[2] = {false, false};
10665 for (int i = 0; i < Size; ++i)
10666 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10667 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10668 if (!LaneCrossing[0] || !LaneCrossing[1])
10669 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10670
10671 assert(V2.isUndef() &&
10672 "The last part of this routine only works on single-input shuffles");
10673
10674 SmallVector<int, 32> FlippedBlendMask(Size);
10675 for (int i = 0; i < Size; ++i)
10676 FlippedBlendMask[i] =
10677 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10678 ? Mask[i]
10679 : Mask[i] % LaneSize +
10680 (i / LaneSize) * LaneSize + Size);
10681
10682 // Flip the vector, and blend the results which should now be in-lane. The
10683 // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
10684 // 5 for the high source. The value 3 selects the high half of source 2 and
10685 // the value 2 selects the low half of source 2. We only use source 2 to
10686 // allow folding it into a memory operand.
10687 unsigned PERMMask = 3 | 2 << 4;
10688 SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10689 V1, DAG.getConstant(PERMMask, DL, MVT::i8));
10690 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10691 }
10692
10693 /// \brief Handle lowering 2-lane 128-bit shuffles.
10694 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
10695 SDValue V2, ArrayRef<int> Mask,
10696 const X86Subtarget &Subtarget,
10697 SelectionDAG &DAG) {
10698 // TODO: If minimizing size and one of the inputs is a zero vector and the
10699 // zero vector has only one use, we could use a VPERM2X128 to save the
10700 // instruction bytes needed to explicitly generate the zero vector.
10701
10702 // Blends are faster and handle all the non-lane-crossing cases.
10703 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10704 Subtarget, DAG))
10705 return Blend;
10706
10707 bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
10708 bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
10709
10710 // If either input operand is a zero vector, use VPERM2X128 because its mask
10711 // allows us to replace the zero input with an implicit zero.
10712 if (!IsV1Zero && !IsV2Zero) {
10713 // Check for patterns which can be matched with a single insert of a 128-bit
10714 // subvector.
10715 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
10716 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
10717 // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
10718 if (Subtarget.hasAVX2() && V2.isUndef())
10719 return SDValue();
10720
10721 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10722 VT.getVectorNumElements() / 2);
10723 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10724 DAG.getIntPtrConstant(0, DL));
10725 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10726 OnlyUsesV1 ? V1 : V2,
10727 DAG.getIntPtrConstant(0, DL));
10728 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10729 }
10730 }
10731
10732 // Otherwise form a 128-bit permutation. After accounting for undefs,
10733 // convert the 64-bit shuffle mask selection values into 128-bit
10734 // selection bits by dividing the indexes by 2 and shifting into positions
10735 // defined by a vperm2*128 instruction's immediate control byte.
10736
10737 // The immediate permute control byte looks like this:
10738 // [1:0] - select 128 bits from sources for low half of destination
10739 // [2] - ignore
10740 // [3] - zero low half of destination
10741 // [5:4] - select 128 bits from sources for high half of destination
10742 // [6] - ignore
10743 // [7] - zero high half of destination
10744
10745 int MaskLO = Mask[0];
10746 if (MaskLO == SM_SentinelUndef)
10747 MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
10748
10749 int MaskHI = Mask[2];
10750 if (MaskHI == SM_SentinelUndef)
10751 MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
10752
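// For example, a mask of <0, 1, 6, 7> yields MaskLO = 0 and MaskHI = 6, so
// PermMask = 0 | (3 << 4) = 0x30: the low half of V1 and the high half of
// V2. (Illustrative values only.)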
10753 unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
10754
10755 // If either input is a zero vector, replace it with an undef input.
10756 // Shuffle mask values < 4 are selecting elements of V1.
10757 // Shuffle mask values >= 4 are selecting elements of V2.
10758 // Adjust each half of the permute mask by clearing the half that was
10759 // selecting the zero vector and setting the zero mask bit.
10760 if (IsV1Zero) {
10761 V1 = DAG.getUNDEF(VT);
10762 if (MaskLO < 4)
10763 PermMask = (PermMask & 0xf0) | 0x08;
10764 if (MaskHI < 4)
10765 PermMask = (PermMask & 0x0f) | 0x80;
10766 }
10767 if (IsV2Zero) {
10768 V2 = DAG.getUNDEF(VT);
10769 if (MaskLO >= 4)
10770 PermMask = (PermMask & 0xf0) | 0x08;
10771 if (MaskHI >= 4)
10772 PermMask = (PermMask & 0x0f) | 0x80;
10773 }
10774
10775 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10776 DAG.getConstant(PermMask, DL, MVT::i8));
10777 }
10778
10779 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10780 /// shuffling each lane.
10781 ///
10782 /// This will only succeed when the result of fixing the 128-bit lanes results
10783 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
10784 /// each 128-bit lane. This handles many cases where we can quickly blend away
10785 /// the lane crosses early and then use simpler shuffles within each lane.
10786 ///
10787 /// FIXME: It might be worthwhile at some point to support this without
10788 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10789 /// in x86 only floating point has interesting non-repeating shuffles, and even
10790 /// those are still *marginally* more expensive.
10791 static SDValue lowerVectorShuffleByMerging128BitLanes(
10792 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10793 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10794 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
10795
10796 int Size = Mask.size();
10797 int LaneSize = 128 / VT.getScalarSizeInBits();
10798 int NumLanes = Size / LaneSize;
10799 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10800
10801 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10802 // check whether the in-128-bit lane shuffles share a repeating pattern.
10803 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
10804 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
10805 for (int i = 0; i < Size; ++i) {
10806 if (Mask[i] < 0)
10807 continue;
10808
10809 int j = i / LaneSize;
10810
10811 if (Lanes[j] < 0) {
10812 // First entry we've seen for this lane.
10813 Lanes[j] = Mask[i] / LaneSize;
10814 } else if (Lanes[j] != Mask[i] / LaneSize) {
10815 // This doesn't match the lane selected previously!
10816 return SDValue();
10817 }
10818
10819 // Check that within each lane we have a consistent shuffle mask.
10820 int k = i % LaneSize;
10821 if (InLaneMask[k] < 0) {
10822 InLaneMask[k] = Mask[i] % LaneSize;
10823 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10824 // This doesn't fit a repeating in-lane mask.
10825 return SDValue();
10826 }
10827 }
10828
10829 // First shuffle the lanes into place.
10830 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10831 VT.getSizeInBits() / 64);
10832 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
10833 for (int i = 0; i < NumLanes; ++i)
10834 if (Lanes[i] >= 0) {
10835 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10836 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10837 }
10838
10839 V1 = DAG.getBitcast(LaneVT, V1);
10840 V2 = DAG.getBitcast(LaneVT, V2);
10841 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10842
10843 // Cast it back to the type we actually want.
10844 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
10845
10846 // Now do a simple shuffle that isn't lane crossing.
10847 SmallVector<int, 8> NewMask((unsigned)Size, -1);
10848 for (int i = 0; i < Size; ++i)
10849 if (Mask[i] >= 0)
10850 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10851 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10852 "Must not introduce lane crosses at this point!");
10853
10854 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10855 }
10856
10857 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
10858 /// This allows for fast cases such as subvector extraction/insertion
10859 /// or shuffling smaller vector types which can lower more efficiently.
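/// For example (illustrative): the v8f32 mask <12, 13, 14, 15, u, u, u, u>
/// only references the upper half of V2, so it can be lowered as an extract
/// of that 128-bit half followed by an insert into the low half of an
/// otherwise undef result.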
10860 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
10861 SDValue V1, SDValue V2,
10862 ArrayRef<int> Mask,
10863 const X86Subtarget &Subtarget,
10864 SelectionDAG &DAG) {
10865 assert(VT.is256BitVector() && "Expected 256-bit vector");
10866
10867 unsigned NumElts = VT.getVectorNumElements();
10868 unsigned HalfNumElts = NumElts / 2;
10869 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
10870
10871 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
10872 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
10873 if (!UndefLower && !UndefUpper)
10874 return SDValue();
10875
10876 // Upper half is undef and lower half is whole upper subvector.
10877 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
10878 if (UndefUpper &&
10879 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
10880 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
10881 DAG.getIntPtrConstant(HalfNumElts, DL));
10882 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
10883 DAG.getIntPtrConstant(0, DL));
10884 }
10885
10886 // Lower half is undef and upper half is whole lower subvector.
10887 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
10888 if (UndefLower &&
10889 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
10890 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
10891 DAG.getIntPtrConstant(0, DL));
10892 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
10893 DAG.getIntPtrConstant(HalfNumElts, DL));
10894 }
10895
10896 // If the shuffle only uses two of the four halves of the input operands,
10897 // then extract them and perform the 'half' shuffle at half width.
10898 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
10899 int HalfIdx1 = -1, HalfIdx2 = -1;
10900 SmallVector<int, 8> HalfMask(HalfNumElts);
10901 unsigned Offset = UndefLower ? HalfNumElts : 0;
10902 for (unsigned i = 0; i != HalfNumElts; ++i) {
10903 int M = Mask[i + Offset];
10904 if (M < 0) {
10905 HalfMask[i] = M;
10906 continue;
10907 }
10908
10909 // Determine which of the 4 half vectors this element is from.
10910 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
10911 int HalfIdx = M / HalfNumElts;
10912
10913 // Determine the element index into its half vector source.
10914 int HalfElt = M % HalfNumElts;
10915
10916 // We can shuffle with up to 2 half vectors, set the new 'half'
10917 // shuffle mask accordingly.
10918 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
10919 HalfMask[i] = HalfElt;
10920 HalfIdx1 = HalfIdx;
10921 continue;
10922 }
10923 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
10924 HalfMask[i] = HalfElt + HalfNumElts;
10925 HalfIdx2 = HalfIdx;
10926 continue;
10927 }
10928
10929 // Too many half vectors referenced.
10930 return SDValue();
10931 }
10932 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
10933
10934 // Only shuffle the halves of the inputs when useful.
10935 int NumLowerHalves =
10936 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
10937 int NumUpperHalves =
10938 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
10939
10940 // uuuuXXXX - don't extract uppers just to insert again.
10941 if (UndefLower && NumUpperHalves != 0)
10942 return SDValue();
10943
10944 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
10945 if (UndefUpper && NumUpperHalves == 2)
10946 return SDValue();
10947
10948 // AVX2 - XXXXuuuu - always extract lowers.
10949 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
10950 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
10951 if (VT == MVT::v4f64 || VT == MVT::v4i64)
10952 return SDValue();
10953 // AVX2 supports variable 32-bit element cross-lane shuffles.
10954 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
10955 // XXXXuuuu - don't extract lowers and uppers.
10956 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
10957 return SDValue();
10958 }
10959 }
10960
10961 auto GetHalfVector = [&](int HalfIdx) {
10962 if (HalfIdx < 0)
10963 return DAG.getUNDEF(HalfVT);
10964 SDValue V = (HalfIdx < 2 ? V1 : V2);
10965 HalfIdx = (HalfIdx % 2) * HalfNumElts;
10966 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
10967 DAG.getIntPtrConstant(HalfIdx, DL));
10968 };
10969
10970 SDValue Half1 = GetHalfVector(HalfIdx1);
10971 SDValue Half2 = GetHalfVector(HalfIdx2);
10972 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
10973 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
10974 DAG.getIntPtrConstant(Offset, DL));
10975 }
10976
10977 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
10978 /// given mask.
10979 ///
10980 /// This returns true if the elements from a particular input are already in the
10981 /// slot required by the given mask and require no permutation.
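/// For example (illustrative): for a v4 mask <0, 1, 6, 7>, input 0 is in
/// place (elements 0 and 1 stay in slots 0 and 1) and so is input 1
/// (elements 6 and 7 land in slots 2 and 3).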
10982 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10983 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10984 int Size = Mask.size();
10985 for (int i = 0; i < Size; ++i)
10986 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
10987 return false;
10988
10989 return true;
10990 }
10991
10992 /// Handle case where shuffle sources are coming from the same 128-bit lane and
10993 /// every lane can be represented as the same repeating mask - allowing us to
10994 /// shuffle the sources with the repeating shuffle and then permute the result
10995 /// to the destination lanes.
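/// For example (illustrative): the v8f32 mask <1, 0, 3, 2, 1, 0, 3, 2>
/// sources every element from the low 128-bit lane, so it can be lowered as
/// the in-lane shuffle <1, 0, 3, 2, u, u, u, u> followed by a lane permute
/// that broadcasts the low 128-bit lane into both halves of the result.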
10996 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
10997 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10998 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10999 int NumElts = VT.getVectorNumElements();
11000 int NumLanes = VT.getSizeInBits() / 128;
11001 int NumLaneElts = NumElts / NumLanes;
11002
11003 // On AVX2 we may be able to just shuffle the lowest elements and then
11004 // broadcast the result.
11005 if (Subtarget.hasAVX2()) {
11006 for (unsigned BroadcastSize : {16, 32, 64}) {
11007 if (BroadcastSize <= VT.getScalarSizeInBits())
11008 continue;
11009 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
11010
11011 // Attempt to match a repeating pattern every NumBroadcastElts,
11012 // accounting for UNDEFs, while only referencing the lowest 128-bit
11013 // lane of the inputs.
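// For example (illustrative): with 64-bit broadcasts the v8i32 mask
// <1, 0, 1, 0, 1, 0, 1, 0> repeats every two elements and only uses the
// low 128-bit lane, so it can be lowered as the shuffle <1, 0, u, u, ...>
// followed by the broadcast-style shuffle <0, 1, 0, 1, ...>.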
11014 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
11015 for (int i = 0; i != NumElts; i += NumBroadcastElts)
11016 for (int j = 0; j != NumBroadcastElts; ++j) {
11017 int M = Mask[i + j];
11018 if (M < 0)
11019 continue;
11020 int &R = RepeatMask[j];
11021 if (0 != ((M % NumElts) / NumLaneElts))
11022 return false;
11023 if (0 <= R && R != M)
11024 return false;
11025 R = M;
11026 }
11027 return true;
11028 };
11029
11030 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
11031 if (!FindRepeatingBroadcastMask(RepeatMask))
11032 continue;
11033
11034 // Shuffle the (lowest) repeated elements in place for broadcast.
11035 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
11036
11037 // Shuffle the actual broadcast.
11038 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
11039 for (int i = 0; i != NumElts; i += NumBroadcastElts)
11040 for (int j = 0; j != NumBroadcastElts; ++j)
11041 BroadcastMask[i + j] = j;
11042 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
11043 BroadcastMask);
11044 }
11045 }
11046
11047 // Bail if we already have a repeated lane shuffle mask.
11048 SmallVector<int, 8> RepeatedShuffleMask;
11049 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
11050 return SDValue();
11051
11052 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
11053 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
11054 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
11055 int NumSubLanes = NumLanes * SubLaneScale;
11056 int NumSubLaneElts = NumLaneElts / SubLaneScale;
11057
11058 // Check that all the sources are coming from the same lane and see if we
11059 // can form a repeating shuffle mask (local to each lane). At the same time,
11060 // determine the source sub-lane for each destination sub-lane.
11061 int TopSrcSubLane = -1;
11062 SmallVector<int, 8> RepeatedLaneMask((unsigned)NumLaneElts, -1);
11063 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
11064 for (int i = 0; i != NumElts; ++i) {
11065 int M = Mask[i];
11066 if (M < 0)
11067 continue;
11068 assert(0 <= M && M < 2 * NumElts);
11069
11070 // Check that the local mask index is the same for every lane. We always do
11071 // this with 128-bit lanes to match is128BitLaneRepeatedShuffleMask.
11072 int LocalM = M < NumElts ? (M % NumLaneElts) : (M % NumLaneElts) + NumElts;
11073 int &RepeatM = RepeatedLaneMask[i % NumLaneElts];
11074 if (0 <= RepeatM && RepeatM != LocalM)
11075 return SDValue();
11076 RepeatM = LocalM;
11077
11078 // Check that the whole of each destination sub-lane comes from the same
11079 // source sub-lane; we need to calculate the source based on where the
11080 // repeated lane mask will have left it.
11081 int SrcLane = (M % NumElts) / NumLaneElts;
11082 int SrcSubLane = (SrcLane * SubLaneScale) +
11083 ((i % NumLaneElts) / NumSubLaneElts);
11084 int &Dst2SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
11085 if (0 <= Dst2SrcSubLane && SrcSubLane != Dst2SrcSubLane)
11086 return SDValue();
11087 Dst2SrcSubLane = SrcSubLane;
11088
11089 // Track the topmost source sub-lane - by setting the remaining to UNDEF
11090 // we can greatly simplify shuffle matching.
11091 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
11092 }
11093 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
11094 "Unexpected source lane");
11095
11096 // Create a repeating shuffle mask for the entire vector.
11097 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
11098 for (int i = 0, e = ((TopSrcSubLane + 1) * NumSubLaneElts); i != e; ++i) {
11099 int M = RepeatedLaneMask[i % NumLaneElts];
11100 if (M < 0)
11101 continue;
11102 int Lane = i / NumLaneElts;
11103 RepeatedMask[i] = M + (Lane * NumLaneElts);
11104 }
11105 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
11106
11107 // Shuffle each source sub-lane to its destination.
11108 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
11109 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
11110 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
11111 if (SrcSubLane < 0)
11112 continue;
11113 for (int j = 0; j != NumSubLaneElts; ++j)
11114 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
11115 }
11116
11117 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
11118 SubLaneMask);
11119 }
11120
11121 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
11122 ArrayRef<int> Mask, SDValue V1,
11123 SDValue V2, SelectionDAG &DAG) {
11124
11125 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ...
11126 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ...
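// For example (illustrative): the v4f64 mask <1, 5, 2, 7> fits the SHUFPD
// pattern directly and yields the immediate 0b1011, where bit i selects the
// low/high double of the corresponding source within its 128-bit lane.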
11127 assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD");
11128 int NumElts = VT.getVectorNumElements();
11129 bool ShufpdMask = true;
11130 bool CommutableMask = true;
11131 unsigned Immediate = 0;
11132 for (int i = 0; i < NumElts; ++i) {
11133 if (Mask[i] < 0)
11134 continue;
11135 int Val = (i & 6) + NumElts * (i & 1);
11136 int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1);
11137 if (Mask[i] < Val || Mask[i] > Val + 1)
11138 ShufpdMask = false;
11139 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
11140 CommutableMask = false;
11141 Immediate |= (Mask[i] % 2) << i;
11142 }
11143 if (ShufpdMask)
11144 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
11145 DAG.getConstant(Immediate, DL, MVT::i8));
11146 if (CommutableMask)
11147 return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
11148 DAG.getConstant(Immediate, DL, MVT::i8));
11149 return SDValue();
11150 }
11151
11152 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
11153 ///
11154 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
11155 /// isn't available.
11156 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11157 SDValue V1, SDValue V2,
11158 const X86Subtarget &Subtarget,
11159 SelectionDAG &DAG) {
11160 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
11161 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
11162 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11163
11164 SmallVector<int, 4> WidenedMask;
11165 if (canWidenShuffleElements(Mask, WidenedMask))
11166 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
11167 Subtarget, DAG))
11168 return V;
11169
11170 if (V2.isUndef()) {
11171 // Check for being able to broadcast a single element.
11172 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11173 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11174 return Broadcast;
11175
11176 // Use low duplicate instructions for masks that match their pattern.
11177 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
11178 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
11179
11180 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
11181 // Non-half-crossing single input shuffles can be lowered with an
11182 // interleaved permutation.
11183 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
11184 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
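// For example (illustrative): the mask <0, 0, 3, 2> yields
// VPERMILPMask = 0b0100, i.e. duplicate the low double in the low lane and
// swap the two doubles in the high lane.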
11185 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
11186 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
11187 }
11188
11189 // With AVX2 we have direct support for this permutation.
11190 if (Subtarget.hasAVX2())
11191 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
11192 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11193
11194 // Try to create an in-lane repeating shuffle mask and then shuffle
11195 // the results into the target lanes.
11196 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11197 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11198 return V;
11199
11200 // Otherwise, fall back.
11201 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
11202 DAG);
11203 }
11204
11205 // Use dedicated unpack instructions for masks that match their pattern.
11206 if (SDValue V =
11207 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
11208 return V;
11209
11210 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
11211 Subtarget, DAG))
11212 return Blend;
11213
11214 // Check if the blend happens to exactly fit that of SHUFPD.
11215 if (SDValue Op =
11216 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
11217 return Op;
11218
11219 // Try to create an in-lane repeating shuffle mask and then shuffle
11220 // the results into the target lanes.
11221 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11222 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11223 return V;
11224
11225 // Try to simplify this by merging 128-bit lanes to enable a lane-based
11226 // shuffle. However, if we have AVX2 and either input is already in place,
11227 // we will be able to shuffle the other input across lanes in a single
11228 // instruction, so skip this pattern.
11229 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
11230 isShuffleMaskInputInPlace(1, Mask))))
11231 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11232 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11233 return Result;
11234
11235 // If we have AVX2 then we always want to lower with a blend because at v4 we
11236 // can fully permute the elements.
11237 if (Subtarget.hasAVX2())
11238 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
11239 Mask, DAG);
11240
11241 // Otherwise fall back on generic lowering.
11242 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
11243 }
11244
11245 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
11246 ///
11247 /// This routine is only called when we have AVX2 and thus a reasonable
11248 /// instruction set for v4i64 shuffling.
11249 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11250 SDValue V1, SDValue V2,
11251 const X86Subtarget &Subtarget,
11252 SelectionDAG &DAG) {
11253 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
11254 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
11255 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11256 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
11257
11258 SmallVector<int, 4> WidenedMask;
11259 if (canWidenShuffleElements(Mask, WidenedMask))
11260 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
11261 Subtarget, DAG))
11262 return V;
11263
11264 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
11265 Subtarget, DAG))
11266 return Blend;
11267
11268 // Check for being able to broadcast a single element.
11269 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
11270 Mask, Subtarget, DAG))
11271 return Broadcast;
11272
11273 if (V2.isUndef()) {
11274 // When the shuffle is mirrored between the 128-bit lanes of the vector, we
11275 // can use lower latency instructions that will operate on both lanes.
11276 SmallVector<int, 2> RepeatedMask;
11277 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
11278 SmallVector<int, 4> PSHUFDMask;
11279 scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
11280 return DAG.getBitcast(
11281 MVT::v4i64,
11282 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
11283 DAG.getBitcast(MVT::v8i32, V1),
11284 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11285 }
11286
11287 // AVX2 provides a direct instruction for permuting a single input across
11288 // lanes.
11289 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
11290 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11291 }
11292
11293 // Try to use shift instructions.
11294 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
11295 Subtarget, DAG))
11296 return Shift;
11297
11298 // Use dedicated unpack instructions for masks that match their pattern.
11299 if (SDValue V =
11300 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
11301 return V;
11302
11303 // Try to simplify this by merging 128-bit lanes to enable a lane-based
11304 // shuffle. However, if we have AVX2 and either input is already in place,
11305 // we will be able to shuffle the other input across lanes in a single
11306 // instruction, so skip this pattern.
11307 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
11308 isShuffleMaskInputInPlace(1, Mask))))
11309 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11310 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
11311 return Result;
11312
11313 // Otherwise fall back on generic blend lowering.
11314 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
11315 Mask, DAG);
11316 }
11317
11318 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
11319 ///
11320 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
11321 /// isn't available.
11322 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11323 SDValue V1, SDValue V2,
11324 const X86Subtarget &Subtarget,
11325 SelectionDAG &DAG) {
11326 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
11327 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
11328 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11329
11330 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
11331 Subtarget, DAG))
11332 return Blend;
11333
11334 // Check for being able to broadcast a single element.
11335 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
11336 Mask, Subtarget, DAG))
11337 return Broadcast;
11338
11339 // If the shuffle mask is repeated in each 128-bit lane, we have many more
11340 // options to efficiently lower the shuffle.
11341 SmallVector<int, 4> RepeatedMask;
11342 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
11343 assert(RepeatedMask.size() == 4 &&
11344 "Repeated masks must be half the mask width!");
11345
11346 // Use even/odd duplicate instructions for masks that match their pattern.
11347 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
11348 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
11349 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
11350 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
11351
11352 if (V2.isUndef())
11353 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
11354 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11355
11356 // Use dedicated unpack instructions for masks that match their pattern.
11357 if (SDValue V =
11358 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
11359 return V;
11360
11361 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
11362 // have already handled any direct blends.
11363 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
11364 }
11365
11366 // Try to create an in-lane repeating shuffle mask and then shuffle
11367 // the results into the target lanes.
11368 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11369 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
11370 return V;
11371
11372 // If we have a single input shuffle with different shuffle patterns in the
11373 // two 128-bit lanes use the variable mask to VPERMILPS.
11374 if (V2.isUndef()) {
11375 SDValue VPermMask[8];
11376 for (int i = 0; i < 8; ++i)
11377 VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
11378 : DAG.getConstant(Mask[i], DL, MVT::i32);
11379 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
11380 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
11381 DAG.getBuildVector(MVT::v8i32, DL, VPermMask));
11382
11383 if (Subtarget.hasAVX2())
11384 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
11385 DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
11386
11387 // Otherwise, fall back.
11388 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
11389 DAG);
11390 }
11391
11392 // Try to simplify this by merging 128-bit lanes to enable a lane-based
11393 // shuffle.
11394 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11395 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
11396 return Result;
11397
11398 // If we have AVX2 then we always want to lower with a blend because at v8 we
11399 // can fully permute the elements.
11400 if (Subtarget.hasAVX2())
11401 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
11402 Mask, DAG);
11403
11404 // Otherwise fall back on generic lowering.
11405 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
11406 }
11407
11408 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
11409 ///
11410 /// This routine is only called when we have AVX2 and thus a reasonable
11411 /// instruction set for v8i32 shuffling.
11412 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11413 SDValue V1, SDValue V2,
11414 const X86Subtarget &Subtarget,
11415 SelectionDAG &DAG) {
11416 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
11417 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
11418 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11419 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
11420
11421 // Whenever we can lower this as a zext, that instruction is strictly faster
11422 // than any alternative. It also allows us to fold memory operands into the
11423 // shuffle in many cases.
11424 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
11425 Mask, Subtarget, DAG))
11426 return ZExt;
11427
11428 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
11429 Subtarget, DAG))
11430 return Blend;
11431
11432 // Check for being able to broadcast a single element.
11433 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
11434 Mask, Subtarget, DAG))
11435 return Broadcast;
11436
11437 // If the shuffle mask is repeated in each 128-bit lane we can use more
11438 // efficient instructions that mirror the shuffles across the two 128-bit
11439 // lanes.
11440 SmallVector<int, 4> RepeatedMask;
11441 if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
11442 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
11443 if (V2.isUndef())
11444 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
11445 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11446
11447 // Use dedicated unpack instructions for masks that match their pattern.
11448 if (SDValue V =
11449 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
11450 return V;
11451 }
11452
11453 // Try to use shift instructions.
11454 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
11455 Subtarget, DAG))
11456 return Shift;
11457
11458 // Try to use byte rotation instructions.
11459 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11460 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
11461 return Rotate;
11462
11463 // Try to create an in-lane repeating shuffle mask and then shuffle
11464 // the results into the target lanes.
11465 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11466 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
11467 return V;
11468
11469 // If the shuffle patterns aren't repeated but it is a single input, directly
11470 // generate a cross-lane VPERMD instruction.
11471 if (V2.isUndef()) {
11472 SDValue VPermMask[8];
11473 for (int i = 0; i < 8; ++i)
11474 VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
11475 : DAG.getConstant(Mask[i], DL, MVT::i32);
11476 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32,
11477 DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
11478 }
11479
11480 // Try to simplify this by merging 128-bit lanes to enable a lane-based
11481 // shuffle.
11482 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11483 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
11484 return Result;
11485
11486 // Otherwise fall back on generic blend lowering.
11487 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
11488 Mask, DAG);
11489 }
11490
11491 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
11492 ///
11493 /// This routine is only called when we have AVX2 and thus a reasonable
11494 /// instruction set for v16i16 shuffling.
11495 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11496 SDValue V1, SDValue V2,
11497 const X86Subtarget &Subtarget,
11498 SelectionDAG &DAG) {
11499 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
11500 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
11501 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11502 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
11503
11504 // Whenever we can lower this as a zext, that instruction is strictly faster
11505 // than any alternative. It also allows us to fold memory operands into the
11506 // shuffle in many cases.
11507 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
11508 Mask, Subtarget, DAG))
11509 return ZExt;
11510
11511 // Check for being able to broadcast a single element.
11512 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
11513 Mask, Subtarget, DAG))
11514 return Broadcast;
11515
11516 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
11517 Subtarget, DAG))
11518 return Blend;
11519
11520 // Use dedicated unpack instructions for masks that match their pattern.
11521 if (SDValue V =
11522 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
11523 return V;
11524
11525 // Try to use shift instructions.
11526 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
11527 Subtarget, DAG))
11528 return Shift;
11529
11530 // Try to use byte rotation instructions.
11531 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11532 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
11533 return Rotate;
11534
11535 // Try to create an in-lane repeating shuffle mask and then shuffle
11536 // the results into the target lanes.
11537 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11538 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
11539 return V;
11540
11541 if (V2.isUndef()) {
11542 // There are no generalized cross-lane shuffle operations available on i16
11543 // element types.
11544 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
11545 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
11546 Mask, DAG);
11547
11548 SmallVector<int, 8> RepeatedMask;
11549 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11550 // As this is a single-input shuffle, the repeated mask should be
11551 // a strictly valid v8i16 mask that we can pass through to the v8i16
11552 // lowering to handle even the v16 case.
11553 return lowerV8I16GeneralSingleInputVectorShuffle(
11554 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
11555 }
11556 }
11557
11558 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1,
11559 V2, Subtarget, DAG))
11560 return PSHUFB;
11561
11562 // Try to simplify this by merging 128-bit lanes to enable a lane-based
11563 // shuffle.
11564 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11565 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
11566 return Result;
11567
11568 // Otherwise fall back on generic lowering.
11569 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
11570 }
11571
11572 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
11573 ///
11574 /// This routine is only called when we have AVX2 and thus a reasonable
11575 /// instruction set for v32i8 shuffling.
11576 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11577 SDValue V1, SDValue V2,
11578 const X86Subtarget &Subtarget,
11579 SelectionDAG &DAG) {
11580 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
11581 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
11582 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11583 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
11584
11585 // Whenever we can lower this as a zext, that instruction is strictly faster
11586 // than any alternative. It also allows us to fold memory operands into the
11587 // shuffle in many cases.
11588 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
11589 Mask, Subtarget, DAG))
11590 return ZExt;
11591
11592 // Check for being able to broadcast a single element.
11593 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
11594 Mask, Subtarget, DAG))
11595 return Broadcast;
11596
11597 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
11598 Subtarget, DAG))
11599 return Blend;
11600
11601 // Use dedicated unpack instructions for masks that match their pattern.
11602 if (SDValue V =
11603 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
11604 return V;
11605
11606 // Try to use shift instructions.
11607 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
11608 Subtarget, DAG))
11609 return Shift;
11610
11611 // Try to use byte rotation instructions.
11612 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11613 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
11614 return Rotate;
11615
11616 // Try to create an in-lane repeating shuffle mask and then shuffle
11617 // the results into the target lanes.
11618 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11619 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
11620 return V;
11621
11622 // There are no generalized cross-lane shuffle operations available on i8
11623 // element types.
11624 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
11625 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
11626 DAG);
11627
11628 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1,
11629 V2, Subtarget, DAG))
11630 return PSHUFB;
11631
11632 // Try to simplify this by merging 128-bit lanes to enable a lane-based
11633 // shuffle.
11634 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11635 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
11636 return Result;
11637
11638 // Otherwise fall back on generic lowering.
11639 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
11640 }
11641
11642 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
11643 ///
11644 /// This routine either breaks down the specific type of a 256-bit x86 vector
11645 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
11646 /// together based on the available instructions.
11647 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11648 MVT VT, SDValue V1, SDValue V2,
11649 const X86Subtarget &Subtarget,
11650 SelectionDAG &DAG) {
11651 // If we have a single input to the zero element, insert that into V1 if we
11652 // can do so cheaply.
11653 int NumElts = VT.getVectorNumElements();
11654 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
11655
11656 if (NumV2Elements == 1 && Mask[0] >= NumElts)
11657 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11658 DL, VT, V1, V2, Mask, Subtarget, DAG))
11659 return Insertion;
11660
11661 // Handle special cases where the lower or upper half is UNDEF.
11662 if (SDValue V =
11663 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
11664 return V;
11665
11666 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
11667 // can check for those subtargets here and avoid much of the subtarget
11668 // querying in the per-vector-type lowering routines. With AVX1 we have
11669 // essentially *zero* ability to manipulate a 256-bit vector with integer
11670 // types. Since we'll use floating point types there eventually, just
11671 // immediately cast everything to a float and operate entirely in that domain.
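// For example (illustrative): on AVX1 a v4i64 shuffle is bitcast to v4f64,
// shuffled in the floating point domain, and bitcast back, while v32i8 and
// v16i16 shuffles either use the bit mask/blend paths below or are split
// into two 128-bit shuffles.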
11672 if (VT.isInteger() && !Subtarget.hasAVX2()) {
11673 int ElementBits = VT.getScalarSizeInBits();
11674 if (ElementBits < 32) {
11675 // No floating point type available, if we can't use the bit operations
11676 // for masking/blending then decompose into 128-bit vectors.
11677 if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
11678 return V;
11679 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11680 return V;
11681 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11682 }
11683
11684 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
11685 VT.getVectorNumElements());
11686 V1 = DAG.getBitcast(FpVT, V1);
11687 V2 = DAG.getBitcast(FpVT, V2);
11688 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
11689 }
11690
11691 switch (VT.SimpleTy) {
11692 case MVT::v4f64:
11693 return lowerV4F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11694 case MVT::v4i64:
11695 return lowerV4I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11696 case MVT::v8f32:
11697 return lowerV8F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11698 case MVT::v8i32:
11699 return lowerV8I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11700 case MVT::v16i16:
11701 return lowerV16I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11702 case MVT::v32i8:
11703 return lowerV32I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11704
11705 default:
11706 llvm_unreachable("Not a valid 256-bit x86 vector type!");
11707 }
11708 }
11709
11710 /// \brief Try to lower a vector shuffle as a 128-bit shuffles.
11711 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
11712 ArrayRef<int> Mask, SDValue V1,
11713 SDValue V2, SelectionDAG &DAG) {
11714 assert(VT.getScalarSizeInBits() == 64 &&
11715 "Unexpected element type size for 128bit shuffle.");
11716
11717 // Handling 256-bit vectors requires VLX, and lowerV2X128VectorShuffle()
11718 // is most likely the better solution for them.
11719 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
11720
11721 SmallVector<int, 4> WidenedMask;
11722 if (!canWidenShuffleElements(Mask, WidenedMask))
11723 return SDValue();
11724
11725 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11726 // Ensure that the elements come from the same Op.
11727 int MaxOp1Index = VT.getVectorNumElements()/2 - 1;
11728 for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
11729 if (WidenedMask[i] == SM_SentinelZero)
11730 return SDValue();
11731 if (WidenedMask[i] == SM_SentinelUndef)
11732 continue;
11733
11734 SDValue Op = WidenedMask[i] > MaxOp1Index ? V2 : V1;
11735 unsigned OpIndex = (i < Size/2) ? 0 : 1;
11736 if (Ops[OpIndex].isUndef())
11737 Ops[OpIndex] = Op;
11738 else if (Ops[OpIndex] != Op)
11739 return SDValue();
11740 }
11741
11742 // Form a 128-bit permutation.
11743 // Convert the 64-bit shuffle mask selection values into 128-bit selection
11744 // bits defined by a vshuf64x2 instruction's immediate control byte.
11745 unsigned PermMask = 0, Imm = 0;
11746 unsigned ControlBitsNum = WidenedMask.size() / 2;
11747
11748 for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
11749 // Use first element in place of undef mask.
11750 Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
11751 PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
11752 }
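// For example (illustrative): for v8i64 the widened mask <0, 2, 5, 7> uses
// V1 for the low 256 bits and V2 for the high 256 bits, giving
// PermMask = 0 | (2 << 2) | (1 << 4) | (3 << 6) = 0xD8.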
11753
11754 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
11755 DAG.getConstant(PermMask, DL, MVT::i8));
11756 }
11757
11758 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
11759 ArrayRef<int> Mask, SDValue V1,
11760 SDValue V2, SelectionDAG &DAG) {
11761
11762 assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
11763
11764 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
11765 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
11766
11767 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
11768 if (V2.isUndef())
11769 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
11770
11771 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
11772 }
11773
11774 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
11775 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11776 SDValue V1, SDValue V2,
11777 const X86Subtarget &Subtarget,
11778 SelectionDAG &DAG) {
11779 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
11780 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
11781 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11782
11783 if (V2.isUndef()) {
11784 // Use low duplicate instructions for masks that match their pattern.
11785 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
11786 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
11787
11788 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
11789 // Non-half-crossing single input shuffles can be lowered with an
11790 // interleaved permutation.
11791 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
11792 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
11793 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
11794 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
11795 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
11796 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
11797 }
11798
11799 SmallVector<int, 4> RepeatedMask;
11800 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
11801 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
11802 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11803 }
11804
11805 if (SDValue Shuf128 =
11806 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
11807 return Shuf128;
11808
11809 if (SDValue Unpck =
11810 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
11811 return Unpck;
11812
11813 // Check if the blend happens to exactly fit that of SHUFPD.
11814 if (SDValue Op =
11815 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
11816 return Op;
11817
11818 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
11819 }
11820
11821 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
11822 static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
11823 SDValue V1, SDValue V2,
11824 const X86Subtarget &Subtarget,
11825 SelectionDAG &DAG) {
11826 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11827 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11828 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11829
11830 // If the shuffle mask is repeated in each 128-bit lane, we have many more
11831 // options to efficiently lower the shuffle.
11832 SmallVector<int, 4> RepeatedMask;
11833 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
11834 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
11835
11836 // Use even/odd duplicate instructions for masks that match their pattern.
11837 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
11838 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
11839 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
11840 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
11841
11842 if (V2.isUndef())
11843 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
11844 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11845
11846 // Use dedicated unpack instructions for masks that match their pattern.
11847 if (SDValue Unpck =
11848 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
11849 return Unpck;
11850
11851 // Otherwise, fall back to a SHUFPS sequence.
11852 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
11853 }
11854
11855 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
11856 }
11857
11858 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
11859 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11860 SDValue V1, SDValue V2,
11861 const X86Subtarget &Subtarget,
11862 SelectionDAG &DAG) {
11863 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11864 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11865 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11866
11867 if (SDValue Shuf128 =
11868 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
11869 return Shuf128;
11870
11871 if (V2.isUndef()) {
11872 // When the shuffle is mirrored between the 128-bit lanes of the vector, we
11873 // can use lower latency instructions that will operate on all four
11874 // 128-bit lanes.
11875 SmallVector<int, 2> Repeated128Mask;
11876 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
11877 SmallVector<int, 4> PSHUFDMask;
11878 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
11879 return DAG.getBitcast(
11880 MVT::v8i64,
11881 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
11882 DAG.getBitcast(MVT::v16i32, V1),
11883 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11884 }
11885
11886 SmallVector<int, 4> Repeated256Mask;
11887 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
11888 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
11889 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
11890 }
11891
11892 // Try to use shift instructions.
11893 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
11894 Subtarget, DAG))
11895 return Shift;
11896
11897 if (SDValue Unpck =
11898 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
11899 return Unpck;
11900
11901 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
11902 }
11903
11904 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
11905 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11906 SDValue V1, SDValue V2,
11907 const X86Subtarget &Subtarget,
11908 SelectionDAG &DAG) {
11909 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11910 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11911 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11912
11913 // If the shuffle mask is repeated in each 128-bit lane we can use more
11914 // efficient instructions that mirror the shuffles across the four 128-bit
11915 // lanes.
11916 SmallVector<int, 4> RepeatedMask;
11917 if (is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask)) {
11918 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
11919 if (V2.isUndef())
11920 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
11921 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11922
11923 // Use dedicated unpack instructions for masks that match their pattern.
11924 if (SDValue V =
11925 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
11926 return V;
11927 }
11928
11929 // Try to use shift instructions.
11930 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
11931 Subtarget, DAG))
11932 return Shift;
11933
11934 // Try to use byte rotation instructions.
11935 if (Subtarget.hasBWI())
11936 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11937 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
11938 return Rotate;
11939
11940 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
11941 }
11942
11943 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
11944 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11945 SDValue V1, SDValue V2,
11946 const X86Subtarget &Subtarget,
11947 SelectionDAG &DAG) {
11948 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11949 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11950 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11951 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
11952
11953 // Use dedicated unpack instructions for masks that match their pattern.
11954 if (SDValue V =
11955 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
11956 return V;
11957
11958 // Try to use shift instructions.
11959 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
11960 Subtarget, DAG))
11961 return Shift;
11962
11963 // Try to use byte rotation instructions.
11964 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11965 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
11966 return Rotate;
11967
11968 if (V2.isUndef()) {
11969 SmallVector<int, 8> RepeatedMask;
11970 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
11971 // As this is a single-input shuffle, the repeated mask should be
11972 // a strictly valid v8i16 mask that we can pass through to the v8i16
11973 // lowering to handle even the v32 case.
11974 return lowerV8I16GeneralSingleInputVectorShuffle(
11975 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
11976 }
11977 }
11978
11979 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
11980 }
11981
11982 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
11983 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11984 SDValue V1, SDValue V2,
11985 const X86Subtarget &Subtarget,
11986 SelectionDAG &DAG) {
11987 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11988 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11989 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
11990 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
11991
11992 // Use dedicated unpack instructions for masks that match their pattern.
11993 if (SDValue V =
11994 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
11995 return V;
11996
11997 // Try to use shift instructions.
11998 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
11999 Subtarget, DAG))
12000 return Shift;
12001
12002 // Try to use byte rotation instructions.
12003 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12004 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
12005 return Rotate;
12006
12007 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1,
12008 V2, Subtarget, DAG))
12009 return PSHUFB;
12010
12011 // FIXME: Implement direct support for this type!
12012 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
12013 }
12014
12015 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
12016 ///
12017 /// This routine either breaks down the specific type of a 512-bit x86 vector
12018 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
12019 /// together based on the available instructions.
12020 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12021 MVT VT, SDValue V1, SDValue V2,
12022 const X86Subtarget &Subtarget,
12023 SelectionDAG &DAG) {
12024 assert(Subtarget.hasAVX512() &&
12025 "Cannot lower 512-bit vectors w/ basic ISA!");
12026
12027 // Check for being able to broadcast a single element.
12028 if (SDValue Broadcast =
12029 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
12030 return Broadcast;
12031
12032 // Dispatch to each element type for lowering. If we don't have support for
12033 // specific element type shuffles at 512 bits, immediately split them and
12034 // lower them. Each lowering routine of a given type is allowed to assume that
12035 // the requisite ISA extensions for that element type are available.
12036 switch (VT.SimpleTy) {
12037 case MVT::v8f64:
12038 return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12039 case MVT::v16f32:
12040 return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12041 case MVT::v8i64:
12042 return lowerV8I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12043 case MVT::v16i32:
12044 return lowerV16I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12045 case MVT::v32i16:
12046 return lowerV32I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12047 case MVT::v64i8:
12048 return lowerV64I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12049
12050 default:
12051 llvm_unreachable("Not a valid 512-bit x86 vector type!");
12052 }
12053 }
12054
12055 // Lower vXi1 vector shuffles.
12056 // There is no dedicated instruction on AVX-512 that shuffles the masks.
12057 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
12058 // vector, shuffle, and then truncate it back.
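// For example, a v16i1 shuffle is performed as a v16i32 shuffle whose result
// is truncated back down to v16i1; the exact SIMD width used below depends on
// the element count, per the ExtVT switch.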
12059 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12060 MVT VT, SDValue V1, SDValue V2,
12061 const X86Subtarget &Subtarget,
12062 SelectionDAG &DAG) {
12063 assert(Subtarget.hasAVX512() &&
12064 "Cannot lower 512-bit vectors w/o basic ISA!");
12065 MVT ExtVT;
12066 switch (VT.SimpleTy) {
12067 default:
12068 llvm_unreachable("Expected a vector of i1 elements");
12069 case MVT::v2i1:
12070 ExtVT = MVT::v2i64;
12071 break;
12072 case MVT::v4i1:
12073 ExtVT = MVT::v4i32;
12074 break;
12075 case MVT::v8i1:
12076     ExtVT = MVT::v8i64; // Use a 512-bit type; KNL has more shuffles at 512 bits.
12077 break;
12078 case MVT::v16i1:
12079 ExtVT = MVT::v16i32;
12080 break;
12081 case MVT::v32i1:
12082 ExtVT = MVT::v32i16;
12083 break;
12084 case MVT::v64i1:
12085 ExtVT = MVT::v64i8;
12086 break;
12087 }
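  // Materialize all-zeros/all-ones constant inputs directly in the extended
  // type; anything else is sign-extended so each i1 lane becomes an all-zeros
  // or all-ones element of ExtVT.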
12088
12089 if (ISD::isBuildVectorAllZeros(V1.getNode()))
12090 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
12091 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
12092 V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
12093 else
12094 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
12095
12096 if (V2.isUndef())
12097 V2 = DAG.getUNDEF(ExtVT);
12098 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
12099 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
12100 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
12101 V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
12102 else
12103 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
12104 return DAG.getNode(ISD::TRUNCATE, DL, VT,
12105 DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask));
12106 }
12107 /// \brief Top-level lowering for x86 vector shuffles.
12108 ///
12109 /// This handles decomposition, canonicalization, and lowering of all x86
12110 /// vector shuffles. Most of the specific lowering strategies are encapsulated
12111 /// above in helper routines. The canonicalization attempts to widen shuffles
12112 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
12113 /// s.t. only one of the two inputs needs to be tested, etc.
12114 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
12115 SelectionDAG &DAG) {
12116 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12117 ArrayRef<int> Mask = SVOp->getMask();
12118 SDValue V1 = Op.getOperand(0);
12119 SDValue V2 = Op.getOperand(1);
12120 MVT VT = Op.getSimpleValueType();
12121 int NumElements = VT.getVectorNumElements();
12122 SDLoc DL(Op);
12123 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
12124
12125 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
12126 "Can't lower MMX shuffles");
12127
12128 bool V1IsUndef = V1.isUndef();
12129 bool V2IsUndef = V2.isUndef();
12130 if (V1IsUndef && V2IsUndef)
12131 return DAG.getUNDEF(VT);
12132
12133   // When we create a shuffle node we put the UNDEF node as the second operand,
12134   // but in some cases the first operand may be transformed to UNDEF.
12135   // In that case we should just commute the node.
12136 if (V1IsUndef)
12137 return DAG.getCommutedVectorShuffle(*SVOp);
12138
12139 // Check for non-undef masks pointing at an undef vector and make the masks
12140 // undef as well. This makes it easier to match the shuffle based solely on
12141 // the mask.
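  // For example, with 4 elements, a mask of <0, 5, 2, 7> with an undef V2
  // becomes <0, -1, 2, -1>.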
12142 if (V2IsUndef)
12143 for (int M : Mask)
12144 if (M >= NumElements) {
12145 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
12146 for (int &M : NewMask)
12147 if (M >= NumElements)
12148 M = -1;
12149 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
12150 }
12151
12152 // We actually see shuffles that are entirely re-arrangements of a set of
12153 // zero inputs. This mostly happens while decomposing complex shuffles into
12154 // simple ones. Directly lower these as a buildvector of zeros.
12155 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
12156 if (Zeroable.all())
12157 return getZeroVector(VT, Subtarget, DAG, DL);
12158
12159 // Try to collapse shuffles into using a vector type with fewer elements but
12160 // wider element types. We cap this to not form integers or floating point
12161 // elements wider than 64 bits, but it might be interesting to form i128
12162 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
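  // For example, a v4i32 shuffle with mask <0, 1, 6, 7> can be widened to a
  // v2i64 shuffle with mask <0, 3>.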
12163 SmallVector<int, 16> WidenedMask;
12164 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
12165 canWidenShuffleElements(Mask, WidenedMask)) {
12166 MVT NewEltVT = VT.isFloatingPoint()
12167 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
12168 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
12169 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
12170 // Make sure that the new vector type is legal. For example, v2f64 isn't
12171 // legal on SSE1.
12172 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12173 V1 = DAG.getBitcast(NewVT, V1);
12174 V2 = DAG.getBitcast(NewVT, V2);
12175 return DAG.getBitcast(
12176 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
12177 }
12178 }
12179
12180 int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
12181 for (int M : Mask)
12182 if (M < 0)
12183 ++NumUndefElements;
12184 else if (M < NumElements)
12185 ++NumV1Elements;
12186 else
12187 ++NumV2Elements;
12188
12189 // Commute the shuffle as needed such that more elements come from V1 than
12190 // V2. This allows us to match the shuffle pattern strictly on how many
12191 // elements come from V1 without handling the symmetric cases.
12192 if (NumV2Elements > NumV1Elements)
12193 return DAG.getCommutedVectorShuffle(*SVOp);
12194
12195 assert(NumV1Elements > 0 && "No V1 indices");
12196 assert((NumV2Elements > 0 || V2IsUndef) && "V2 not undef, but not used");
12197
12198   // When the numbers of V1 and V2 elements are the same, try to minimize the
12199   // number of uses of V2 in the low half of the vector. When that is tied,
12200   // ensure that the sum of indices for V1 is equal to or lower than the sum of
12201   // indices for V2. When those are equal, try to ensure that the number of odd
12202   // indices for V1 is lower than the number of odd indices for V2.
12203 if (NumV1Elements == NumV2Elements) {
12204 int LowV1Elements = 0, LowV2Elements = 0;
12205 for (int M : Mask.slice(0, NumElements / 2))
12206 if (M >= NumElements)
12207 ++LowV2Elements;
12208 else if (M >= 0)
12209 ++LowV1Elements;
12210 if (LowV2Elements > LowV1Elements)
12211 return DAG.getCommutedVectorShuffle(*SVOp);
12212 if (LowV2Elements == LowV1Elements) {
12213 int SumV1Indices = 0, SumV2Indices = 0;
12214 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12215 if (Mask[i] >= NumElements)
12216 SumV2Indices += i;
12217 else if (Mask[i] >= 0)
12218 SumV1Indices += i;
12219 if (SumV2Indices < SumV1Indices)
12220 return DAG.getCommutedVectorShuffle(*SVOp);
12221 if (SumV2Indices == SumV1Indices) {
12222 int NumV1OddIndices = 0, NumV2OddIndices = 0;
12223 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12224 if (Mask[i] >= NumElements)
12225 NumV2OddIndices += i % 2;
12226 else if (Mask[i] >= 0)
12227 NumV1OddIndices += i % 2;
12228 if (NumV2OddIndices < NumV1OddIndices)
12229 return DAG.getCommutedVectorShuffle(*SVOp);
12230 }
12231 }
12232 }
12233
12234 // For each vector width, delegate to a specialized lowering routine.
12235 if (VT.is128BitVector())
12236 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12237
12238 if (VT.is256BitVector())
12239 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12240
12241 if (VT.is512BitVector())
12242 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12243
12244 if (Is1BitVector)
12245 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12246
12247 llvm_unreachable("Unimplemented!");
12248 }
12249
12250 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
12251 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
12252 const X86Subtarget &Subtarget,
12253 SelectionDAG &DAG) {
12254 SDValue Cond = Op.getOperand(0);
12255 SDValue LHS = Op.getOperand(1);
12256 SDValue RHS = Op.getOperand(2);
12257 SDLoc dl(Op);
12258 MVT VT = Op.getSimpleValueType();
12259
12260 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12261 return SDValue();
12262 auto *CondBV = cast<BuildVectorSDNode>(Cond);
12263
12264   // Only non-legal VSELECTs reach this lowering; convert those into generic
12265   // shuffles and re-use the shuffle lowering path for blends.
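  // A constant non-zero condition element selects lane i of LHS, a constant
  // zero element selects lane i of RHS (encoded as i + Size), and a
  // non-constant element becomes an undef (-1) mask entry.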
12266 SmallVector<int, 32> Mask;
12267 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
12268 SDValue CondElt = CondBV->getOperand(i);
12269 Mask.push_back(
12270 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
12271 : -1);
12272 }
12273 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
12274 }
12275
12276 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
12277 // A vselect where all conditions and data are constants can be optimized into
12278 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12279 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12280 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12281 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12282 return SDValue();
12283
12284 // Try to lower this to a blend-style vector shuffle. This can handle all
12285 // constant condition cases.
12286 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
12287 return BlendOp;
12288
12289 // Variable blends are only legal from SSE4.1 onward.
12290 if (!Subtarget.hasSSE41())
12291 return SDValue();
12292
12293 // Only some types will be legal on some subtargets. If we can emit a legal
12294   // VSELECT-matching blend, return Op; but if we need to expand, return
12295   // a null value.
12296 switch (Op.getSimpleValueType().SimpleTy) {
12297 default:
12298 // Most of the vector types have blends past SSE4.1.
12299 return Op;
12300
12301 case MVT::v32i8:
12302 // The byte blends for AVX vectors were introduced only in AVX2.
12303 if (Subtarget.hasAVX2())
12304 return Op;
12305
12306 return SDValue();
12307
12308 case MVT::v8i16:
12309 case MVT::v16i16:
12310 // AVX-512 BWI and VLX features support VSELECT with i16 elements.
12311 if (Subtarget.hasBWI() && Subtarget.hasVLX())
12312 return Op;
12313
12314 // FIXME: We should custom lower this by fixing the condition and using i8
12315 // blends.
12316 return SDValue();
12317 }
12318 }
12319
12320 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
12321 MVT VT = Op.getSimpleValueType();
12322 SDLoc dl(Op);
12323
12324 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12325 return SDValue();
12326
12327 if (VT.getSizeInBits() == 8) {
12328 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12329 Op.getOperand(0), Op.getOperand(1));
12330 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12331 DAG.getValueType(VT));
12332 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12333 }
12334
12335 if (VT.getSizeInBits() == 16) {
12336 // If Idx is 0, it's cheaper to do a move instead of a pextrw.
12337 if (isNullConstant(Op.getOperand(1)))
12338 return DAG.getNode(
12339 ISD::TRUNCATE, dl, MVT::i16,
12340 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12341 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
12342 Op.getOperand(1)));
12343 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
12344 Op.getOperand(0), Op.getOperand(1));
12345 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12346 DAG.getValueType(VT));
12347 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12348 }
12349
12350 if (VT == MVT::f32) {
12351 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
12352 // the result back to FR32 register. It's only worth matching if the
12353 // result has a single use which is a store or a bitcast to i32. And in
12354 // the case of a store, it's not worth it if the index is a constant 0,
12355 // because a MOVSSmr can be used instead, which is smaller and faster.
12356 if (!Op.hasOneUse())
12357 return SDValue();
12358 SDNode *User = *Op.getNode()->use_begin();
12359 if ((User->getOpcode() != ISD::STORE ||
12360 isNullConstant(Op.getOperand(1))) &&
12361 (User->getOpcode() != ISD::BITCAST ||
12362 User->getValueType(0) != MVT::i32))
12363 return SDValue();
12364 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12365 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
12366 Op.getOperand(1));
12367 return DAG.getBitcast(MVT::f32, Extract);
12368 }
12369
12370 if (VT == MVT::i32 || VT == MVT::i64) {
12371     // ExtractPS/pextrq work with a constant index.
12372 if (isa<ConstantSDNode>(Op.getOperand(1)))
12373 return Op;
12374 }
12375 return SDValue();
12376 }
12377
12378 /// Extract one bit from mask vector, like v16i1 or v8i1.
12379 /// AVX-512 feature.
12380 SDValue
12381 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
12382 SDValue Vec = Op.getOperand(0);
12383 SDLoc dl(Vec);
12384 MVT VecVT = Vec.getSimpleValueType();
12385 SDValue Idx = Op.getOperand(1);
12386 MVT EltVT = Op.getSimpleValueType();
12387
12388 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
12389 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
12390 "Unexpected vector type in ExtractBitFromMaskVector");
12391
12392   // A variable index can't be handled in mask registers;
12393   // extend the vector to VR512.
12394 if (!isa<ConstantSDNode>(Idx)) {
12395 MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
12396 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
12397 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12398 ExtVT.getVectorElementType(), Ext, Idx);
12399 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
12400 }
12401
12402 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12403 if (!Subtarget.hasDQI() && (VecVT.getVectorNumElements() <= 8)) {
12404 // Use kshiftlw/rw instruction.
12405 VecVT = MVT::v16i1;
12406 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
12407 DAG.getUNDEF(VecVT),
12408 Vec,
12409 DAG.getIntPtrConstant(0, dl));
12410 }
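  // Shift the requested bit up into the top element of the mask register and
  // then back down to element 0 so it can be extracted from the low position.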
12411 unsigned MaxSift = VecVT.getVectorNumElements() - 1;
12412 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
12413 DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
12414 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
12415 DAG.getConstant(MaxSift, dl, MVT::i8));
12416 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
12417 DAG.getIntPtrConstant(0, dl));
12418 }
12419
12420 SDValue
12421 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
12422 SelectionDAG &DAG) const {
12423 SDLoc dl(Op);
12424 SDValue Vec = Op.getOperand(0);
12425 MVT VecVT = Vec.getSimpleValueType();
12426 SDValue Idx = Op.getOperand(1);
12427
12428 if (Op.getSimpleValueType() == MVT::i1)
12429 return ExtractBitFromMaskVector(Op, DAG);
12430
12431 if (!isa<ConstantSDNode>(Idx)) {
12432 if (VecVT.is512BitVector() ||
12433 (VecVT.is256BitVector() && Subtarget.hasInt256() &&
12434 VecVT.getVectorElementType().getSizeInBits() == 32)) {
12435
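      // Build a mask vector whose element 0 holds the (variable) index, use
      // VPERMV to move the requested element into lane 0, and then extract
      // lane 0 of the permuted vector.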
12436 MVT MaskEltVT =
12437 MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
12438 MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
12439 MaskEltVT.getSizeInBits());
12440
12441 Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
12442 auto PtrVT = getPointerTy(DAG.getDataLayout());
12443 SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
12444 getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
12445 DAG.getConstant(0, dl, PtrVT));
12446 SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
12447 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
12448 DAG.getConstant(0, dl, PtrVT));
12449 }
12450 return SDValue();
12451 }
12452
12453 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12454
12455 // If this is a 256-bit vector result, first extract the 128-bit vector and
12456 // then extract the element from the 128-bit vector.
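  // For example, extracting element 9 of a v16i32 first extracts the 128-bit
  // chunk holding elements 8-11 and then extracts element 1 of that v4i32.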
12457 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
12458 // Get the 128-bit vector.
12459 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
12460 MVT EltVT = VecVT.getVectorElementType();
12461
12462 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
12463 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
12464
12465 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
12466 // this can be done with a mask.
12467 IdxVal &= ElemsPerChunk - 1;
12468 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
12469 DAG.getConstant(IdxVal, dl, MVT::i32));
12470 }
12471
12472 assert(VecVT.is128BitVector() && "Unexpected vector length");
12473
12474 if (Subtarget.hasSSE41())
12475 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
12476 return Res;
12477
12478 MVT VT = Op.getSimpleValueType();
12479 // TODO: handle v16i8.
12480 if (VT.getSizeInBits() == 16) {
12481 if (IdxVal == 0)
12482 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12483 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12484 DAG.getBitcast(MVT::v4i32, Vec), Idx));
12485
12486     // Transform it so it matches pextrw, which produces a 32-bit result.
12487 MVT EltVT = MVT::i32;
12488 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Vec, Idx);
12489 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
12490 DAG.getValueType(VT));
12491 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12492 }
12493
12494 if (VT.getSizeInBits() == 32) {
12495 if (IdxVal == 0)
12496 return Op;
12497
12498 // SHUFPS the element to the lowest double word, then movss.
12499 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
12500 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
12501 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12502 DAG.getIntPtrConstant(0, dl));
12503 }
12504
12505 if (VT.getSizeInBits() == 64) {
12506 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
12507 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
12508 // to match extract_elt for f64.
12509 if (IdxVal == 0)
12510 return Op;
12511
12512 // UNPCKHPD the element to the lowest double word, then movsd.
12513 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
12514 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
12515 int Mask[2] = { 1, -1 };
12516 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
12517 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12518 DAG.getIntPtrConstant(0, dl));
12519 }
12520
12521 return SDValue();
12522 }
12523
12524 /// Insert one bit to mask vector, like v16i1 or v8i1.
12525 /// AVX-512 feature.
12526 SDValue
12527 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
12528 SDLoc dl(Op);
12529 SDValue Vec = Op.getOperand(0);
12530 SDValue Elt = Op.getOperand(1);
12531 SDValue Idx = Op.getOperand(2);
12532 MVT VecVT = Vec.getSimpleValueType();
12533
12534 if (!isa<ConstantSDNode>(Idx)) {
12535     // Non-constant index. Extend the source and destination,
12536     // insert the element and then truncate the result.
12537 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
12538 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
12539 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
12540 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
12541 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
12542 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
12543 }
12544
12545 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12546 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
12547 if (IdxVal)
12548 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
12549 DAG.getConstant(IdxVal, dl, MVT::i8));
12550 if (Vec.isUndef())
12551 return EltInVec;
12552 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
12553 }
12554
12555 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
12556 SelectionDAG &DAG) const {
12557 MVT VT = Op.getSimpleValueType();
12558 MVT EltVT = VT.getVectorElementType();
12559 unsigned NumElts = VT.getVectorNumElements();
12560
12561 if (EltVT == MVT::i1)
12562 return InsertBitToMaskVector(Op, DAG);
12563
12564 SDLoc dl(Op);
12565 SDValue N0 = Op.getOperand(0);
12566 SDValue N1 = Op.getOperand(1);
12567 SDValue N2 = Op.getOperand(2);
12568 if (!isa<ConstantSDNode>(N2))
12569 return SDValue();
12570 auto *N2C = cast<ConstantSDNode>(N2);
12571 unsigned IdxVal = N2C->getZExtValue();
12572
12573   // If we are clearing out an element, we do this more efficiently with a
12574   // blend shuffle than with a costly integer insertion.
12575 // TODO: would other rematerializable values (e.g. allbits) benefit as well?
12576 // TODO: pre-SSE41 targets will tend to use bit masking - this could still
12577 // be beneficial if we are inserting several zeros and can combine the masks.
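  // For example, inserting zero into element 2 of a v4i32 becomes the shuffle
  // <0, 1, 6, 3> against a zero vector as the second operand.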
12578 if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
12579 SmallVector<int, 8> ClearMask;
12580 for (unsigned i = 0; i != NumElts; ++i)
12581 ClearMask.push_back(i == IdxVal ? i + NumElts : i);
12582 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
12583 return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
12584 }
12585
12586 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
12587 // into that, and then insert the subvector back into the result.
12588 if (VT.is256BitVector() || VT.is512BitVector()) {
12589 // With a 256-bit vector, we can insert into the zero element efficiently
12590 // using a blend if we have AVX or AVX2 and the right data type.
12591 if (VT.is256BitVector() && IdxVal == 0) {
12592 // TODO: It is worthwhile to cast integer to floating point and back
12593 // and incur a domain crossing penalty if that's what we'll end up
12594 // doing anyway after extracting to a 128-bit vector.
12595 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12596 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
12597 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
12598 N2 = DAG.getIntPtrConstant(1, dl);
12599 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
12600 }
12601 }
12602
12603 // Get the desired 128-bit vector chunk.
12604 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
12605
12606 // Insert the element into the desired chunk.
12607 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
12608 assert(isPowerOf2_32(NumEltsIn128));
12609 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
12610 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
12611
12612 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
12613 DAG.getConstant(IdxIn128, dl, MVT::i32));
12614
12615 // Insert the changed part back into the bigger vector
12616 return insert128BitVector(N0, V, IdxVal, DAG, dl);
12617 }
12618 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
12619
12620 if (Subtarget.hasSSE41()) {
12621 if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
12622 unsigned Opc;
12623 if (VT == MVT::v8i16) {
12624 Opc = X86ISD::PINSRW;
12625 } else {
12626 assert(VT == MVT::v16i8);
12627 Opc = X86ISD::PINSRB;
12628 }
12629
12630       // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
12631       // second argument.
12632 if (N1.getValueType() != MVT::i32)
12633 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
12634 if (N2.getValueType() != MVT::i32)
12635 N2 = DAG.getIntPtrConstant(IdxVal, dl);
12636 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
12637 }
12638
12639 if (EltVT == MVT::f32) {
12640 // Bits [7:6] of the constant are the source select. This will always be
12641 // zero here. The DAG Combiner may combine an extract_elt index into
12642 // these bits. For example (insert (extract, 3), 2) could be matched by
12643 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
12644 // Bits [5:4] of the constant are the destination select. This is the
12645 // value of the incoming immediate.
12646 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
12647 // combine either bitwise AND or insert of float 0.0 to set these bits.
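      // For example, inserting into element 3 uses an immediate of
      // (3 << 4) == 0x30: source select 0, destination select 3, zero mask 0.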
12648
12649 bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
12650 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
12651 // If this is an insertion of 32-bits into the low 32-bits of
12652 // a vector, we prefer to generate a blend with immediate rather
12653 // than an insertps. Blends are simpler operations in hardware and so
12654 // will always have equal or better performance than insertps.
12655 // But if optimizing for size and there's a load folding opportunity,
12656 // generate insertps because blendps does not have a 32-bit memory
12657 // operand form.
12658 N2 = DAG.getIntPtrConstant(1, dl);
12659 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
12660 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
12661 }
12662 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
12663       // Create this as a scalar-to-vector.
12664 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
12665 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
12666 }
12667
12668 if (EltVT == MVT::i32 || EltVT == MVT::i64) {
12669 // PINSR* works with constant index.
12670 return Op;
12671 }
12672 }
12673
12674 if (EltVT == MVT::i8)
12675 return SDValue();
12676
12677 if (EltVT.getSizeInBits() == 16) {
12678     // Transform it so it matches pinsrw, which expects a 16-bit value in a
12679     // GR32 as its second argument.
12680 if (N1.getValueType() != MVT::i32)
12681 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
12682 if (N2.getValueType() != MVT::i32)
12683 N2 = DAG.getIntPtrConstant(IdxVal, dl);
12684 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
12685 }
12686 return SDValue();
12687 }
12688
12689 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
12690 SDLoc dl(Op);
12691 MVT OpVT = Op.getSimpleValueType();
12692
12693 // If this is a 256-bit vector result, first insert into a 128-bit
12694 // vector and then insert into the 256-bit vector.
12695 if (!OpVT.is128BitVector()) {
12696 // Insert into a 128-bit vector.
12697 unsigned SizeFactor = OpVT.getSizeInBits()/128;
12698 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
12699 OpVT.getVectorNumElements() / SizeFactor);
12700
12701 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
12702
12703 // Insert the 128-bit vector.
12704 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
12705 }
12706
12707 if (OpVT == MVT::v1i64 &&
12708 Op.getOperand(0).getValueType() == MVT::i64)
12709 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
12710
12711 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
12712 assert(OpVT.is128BitVector() && "Expected an SSE type!");
12713 return DAG.getBitcast(
12714 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
12715 }
12716
12717 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
12718 // a simple subregister reference or explicit instructions to grab
12719 // upper bits of a vector.
12720 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
12721 SelectionDAG &DAG) {
12722 SDLoc dl(Op);
12723 SDValue In = Op.getOperand(0);
12724 SDValue Idx = Op.getOperand(1);
12725 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12726 MVT ResVT = Op.getSimpleValueType();
12727 MVT InVT = In.getSimpleValueType();
12728
12729 if (Subtarget.hasFp256()) {
12730 if (ResVT.is128BitVector() &&
12731 (InVT.is256BitVector() || InVT.is512BitVector()) &&
12732 isa<ConstantSDNode>(Idx)) {
12733 return extract128BitVector(In, IdxVal, DAG, dl);
12734 }
12735 if (ResVT.is256BitVector() && InVT.is512BitVector() &&
12736 isa<ConstantSDNode>(Idx)) {
12737 return extract256BitVector(In, IdxVal, DAG, dl);
12738 }
12739 }
12740 return SDValue();
12741 }
12742
12743 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
12744 // simple superregister reference or explicit instructions to insert
12745 // the upper bits of a vector.
12746 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
12747 SelectionDAG &DAG) {
12748 if (!Subtarget.hasAVX())
12749 return SDValue();
12750
12751 SDLoc dl(Op);
12752 SDValue Vec = Op.getOperand(0);
12753 SDValue SubVec = Op.getOperand(1);
12754 SDValue Idx = Op.getOperand(2);
12755
12756 if (!isa<ConstantSDNode>(Idx))
12757 return SDValue();
12758
12759 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12760 MVT OpVT = Op.getSimpleValueType();
12761 MVT SubVecVT = SubVec.getSimpleValueType();
12762
12763 // Fold two 16-byte subvector loads into one 32-byte load:
12764 // (insert_subvector (insert_subvector undef, (load addr), 0),
12765 // (load addr + 16), Elts/2)
12766 // --> load32 addr
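  // This only fires for the canonical build-from-halves pattern: the upper
  // half is being inserted and the lower half was inserted into undef at
  // index 0, and the combined 32-byte access must be allowed (and fast) on
  // the target.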
12767 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
12768 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
12769 OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
12770 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
12771 if (Idx2 && Idx2->getZExtValue() == 0) {
12772 // If needed, look through bitcasts to get to the load.
12773 SDValue SubVec2 = peekThroughBitcasts(Vec.getOperand(1));
12774 if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {
12775 bool Fast;
12776 unsigned Alignment = FirstLd->getAlignment();
12777 unsigned AS = FirstLd->getAddressSpace();
12778 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
12779 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
12780 OpVT, AS, Alignment, &Fast) && Fast) {
12781 SDValue Ops[] = { SubVec2, SubVec };
12782 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
12783 return Ld;
12784 }
12785 }
12786 }
12787 }
12788
12789 if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
12790 SubVecVT.is128BitVector())
12791 return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
12792
12793 if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
12794 return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
12795
12796 if (OpVT.getVectorElementType() == MVT::i1)
12797 return insert1BitVector(Op, DAG, Subtarget);
12798
12799 return SDValue();
12800 }
12801
12802 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
12803 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
12804 // one of the above-mentioned nodes. It has to be wrapped because otherwise
12805 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
12806 // be used to form an addressing mode. These wrapped nodes will be selected
12807 // into MOV32ri.
12808 SDValue
12809 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
12810 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
12811
12812 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12813 // global base reg.
12814 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
12815 unsigned WrapperKind = X86ISD::Wrapper;
12816 CodeModel::Model M = DAG.getTarget().getCodeModel();
12817
12818 if (Subtarget.isPICStyleRIPRel() &&
12819 (M == CodeModel::Small || M == CodeModel::Kernel))
12820 WrapperKind = X86ISD::WrapperRIP;
12821
12822 auto PtrVT = getPointerTy(DAG.getDataLayout());
12823 SDValue Result = DAG.getTargetConstantPool(
12824 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
12825 SDLoc DL(CP);
12826 Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12827 // With PIC, the address is actually $g + Offset.
12828 if (OpFlag) {
12829 Result =
12830 DAG.getNode(ISD::ADD, DL, PtrVT,
12831 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12832 }
12833
12834 return Result;
12835 }
12836
12837 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
12838 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
12839
12840 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12841 // global base reg.
12842 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
12843 unsigned WrapperKind = X86ISD::Wrapper;
12844 CodeModel::Model M = DAG.getTarget().getCodeModel();
12845
12846 if (Subtarget.isPICStyleRIPRel() &&
12847 (M == CodeModel::Small || M == CodeModel::Kernel))
12848 WrapperKind = X86ISD::WrapperRIP;
12849
12850 auto PtrVT = getPointerTy(DAG.getDataLayout());
12851 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
12852 SDLoc DL(JT);
12853 Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12854
12855 // With PIC, the address is actually $g + Offset.
12856 if (OpFlag)
12857 Result =
12858 DAG.getNode(ISD::ADD, DL, PtrVT,
12859 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12860
12861 return Result;
12862 }
12863
12864 SDValue
12865 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
12866 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
12867
12868 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12869 // global base reg.
12870 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
12871 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
12872 unsigned WrapperKind = X86ISD::Wrapper;
12873 CodeModel::Model M = DAG.getTarget().getCodeModel();
12874
12875 if (Subtarget.isPICStyleRIPRel() &&
12876 (M == CodeModel::Small || M == CodeModel::Kernel))
12877 WrapperKind = X86ISD::WrapperRIP;
12878
12879 auto PtrVT = getPointerTy(DAG.getDataLayout());
12880 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
12881
12882 SDLoc DL(Op);
12883 Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12884
12885 // With PIC, the address is actually $g + Offset.
12886 if (isPositionIndependent() && !Subtarget.is64Bit()) {
12887 Result =
12888 DAG.getNode(ISD::ADD, DL, PtrVT,
12889 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12890 }
12891
12892 // For symbols that require a load from a stub to get the address, emit the
12893 // load.
12894 if (isGlobalStubReference(OpFlag))
12895 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
12896 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
12897 false, false, false, 0);
12898
12899 return Result;
12900 }
12901
12902 SDValue
12903 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
12904 // Create the TargetBlockAddressAddress node.
12905 unsigned char OpFlags =
12906 Subtarget.classifyBlockAddressReference();
12907 CodeModel::Model M = DAG.getTarget().getCodeModel();
12908 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
12909 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
12910 SDLoc dl(Op);
12911 auto PtrVT = getPointerTy(DAG.getDataLayout());
12912 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
12913
12914 if (Subtarget.isPICStyleRIPRel() &&
12915 (M == CodeModel::Small || M == CodeModel::Kernel))
12916 Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
12917 else
12918 Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
12919
12920 // With PIC, the address is actually $g + Offset.
12921 if (isGlobalRelativeToPICBase(OpFlags)) {
12922 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
12923 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
12924 }
12925
12926 return Result;
12927 }
12928
12929 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
12930 const SDLoc &dl, int64_t Offset,
12931 SelectionDAG &DAG) const {
12932 // Create the TargetGlobalAddress node, folding in the constant
12933 // offset if it is legal.
12934 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
12935 CodeModel::Model M = DAG.getTarget().getCodeModel();
12936 auto PtrVT = getPointerTy(DAG.getDataLayout());
12937 SDValue Result;
12938 if (OpFlags == X86II::MO_NO_FLAG &&
12939 X86::isOffsetSuitableForCodeModel(Offset, M)) {
12940 // A direct static reference to a global.
12941 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
12942 Offset = 0;
12943 } else {
12944 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
12945 }
12946
12947 if (Subtarget.isPICStyleRIPRel() &&
12948 (M == CodeModel::Small || M == CodeModel::Kernel))
12949 Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
12950 else
12951 Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
12952
12953 // With PIC, the address is actually $g + Offset.
12954 if (isGlobalRelativeToPICBase(OpFlags)) {
12955 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
12956 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
12957 }
12958
12959 // For globals that require a load from a stub to get the address, emit the
12960 // load.
12961 if (isGlobalStubReference(OpFlags))
12962 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
12963 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
12964 false, false, false, 0);
12965
12966 // If there was a non-zero offset that we didn't fold, create an explicit
12967 // addition for it.
12968 if (Offset != 0)
12969 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
12970 DAG.getConstant(Offset, dl, PtrVT));
12971
12972 return Result;
12973 }
12974
12975 SDValue
12976 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
12977 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
12978 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
12979 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
12980 }
12981
12982 static SDValue
12983 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
12984 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
12985 unsigned char OperandFlags, bool LocalDynamic = false) {
12986 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
12987 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
12988 SDLoc dl(GA);
12989 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
12990 GA->getValueType(0),
12991 GA->getOffset(),
12992 OperandFlags);
12993
12994 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
12995 : X86ISD::TLSADDR;
12996
12997 if (InFlag) {
12998 SDValue Ops[] = { Chain, TGA, *InFlag };
12999 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13000 } else {
13001 SDValue Ops[] = { Chain, TGA };
13002 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13003 }
13004
13005   // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
13006 MFI->setAdjustsStack(true);
13007 MFI->setHasCalls(true);
13008
13009 SDValue Flag = Chain.getValue(1);
13010 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13011 }
13012
13013 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
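// The 32-bit sequence first materializes the GOT pointer in EBX and then
// issues the TLSADDR pseudo, which is later expanded into a call to the
// platform's tls_get_addr helper, returning the variable's address in EAX.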
13014 static SDValue
13015 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13016 const EVT PtrVT) {
13017 SDValue InFlag;
13018 SDLoc dl(GA); // ? function entry point might be better
13019 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13020 DAG.getNode(X86ISD::GlobalBaseReg,
13021 SDLoc(), PtrVT), InFlag);
13022 InFlag = Chain.getValue(1);
13023
13024 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13025 }
13026
13027 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13028 static SDValue
13029 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13030 const EVT PtrVT) {
13031 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13032 X86::RAX, X86II::MO_TLSGD);
13033 }
13034
13035 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13036 SelectionDAG &DAG,
13037 const EVT PtrVT,
13038 bool is64Bit) {
13039 SDLoc dl(GA);
13040
13041 // Get the start address of the TLS block for this module.
13042 X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13043 .getInfo<X86MachineFunctionInfo>();
13044 MFI->incNumLocalDynamicTLSAccesses();
13045
13046 SDValue Base;
13047 if (is64Bit) {
13048 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13049 X86II::MO_TLSLD, /*LocalDynamic=*/true);
13050 } else {
13051 SDValue InFlag;
13052 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13053 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13054 InFlag = Chain.getValue(1);
13055 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13056 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13057 }
13058
13059 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13060 // of Base.
13061
13062 // Build x@dtpoff.
13063 unsigned char OperandFlags = X86II::MO_DTPOFF;
13064 unsigned WrapperKind = X86ISD::Wrapper;
13065 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13066 GA->getValueType(0),
13067 GA->getOffset(), OperandFlags);
13068 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13069
13070 // Add x@dtpoff with the base.
13071 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13072 }
13073
13074 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13075 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13076 const EVT PtrVT, TLSModel::Model model,
13077 bool is64Bit, bool isPIC) {
13078 SDLoc dl(GA);
13079
13080 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13081 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13082 is64Bit ? 257 : 256));
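  // In the X86 backend, address space 256 is the GS segment and 257 is the FS
  // segment, so this loads the thread pointer from %gs:0 (32-bit) or %fs:0
  // (64-bit).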
13083
13084 SDValue ThreadPointer =
13085 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
13086 MachinePointerInfo(Ptr), false, false, false, 0);
13087
13088 unsigned char OperandFlags = 0;
13089   // Most TLS accesses are not RIP relative, even on x86-64. One exception is
13090   // the initial-exec model.
13091 unsigned WrapperKind = X86ISD::Wrapper;
13092 if (model == TLSModel::LocalExec) {
13093 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13094 } else if (model == TLSModel::InitialExec) {
13095 if (is64Bit) {
13096 OperandFlags = X86II::MO_GOTTPOFF;
13097 WrapperKind = X86ISD::WrapperRIP;
13098 } else {
13099 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13100 }
13101 } else {
13102 llvm_unreachable("Unexpected model");
13103 }
13104
13105 // emit "addl x@ntpoff,%eax" (local exec)
13106 // or "addl x@indntpoff,%eax" (initial exec)
13107 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13108 SDValue TGA =
13109 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13110 GA->getOffset(), OperandFlags);
13111 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13112
13113 if (model == TLSModel::InitialExec) {
13114 if (isPIC && !is64Bit) {
13115 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13116 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13117 Offset);
13118 }
13119
13120 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13121 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
13122 false, false, false, 0);
13123 }
13124
13125 // The address of the thread local variable is the add of the thread
13126 // pointer with the offset of the variable.
13127 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13128 }
13129
13130 SDValue
13131 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13132
13133 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13134
13135 if (DAG.getTarget().Options.EmulatedTLS)
13136 return LowerToTLSEmulatedModel(GA, DAG);
13137
13138 const GlobalValue *GV = GA->getGlobal();
13139 auto PtrVT = getPointerTy(DAG.getDataLayout());
13140 bool PositionIndependent = isPositionIndependent();
13141
13142 if (Subtarget.isTargetELF()) {
13143 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13144 switch (model) {
13145 case TLSModel::GeneralDynamic:
13146 if (Subtarget.is64Bit())
13147 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
13148 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
13149 case TLSModel::LocalDynamic:
13150 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
13151 Subtarget.is64Bit());
13152 case TLSModel::InitialExec:
13153 case TLSModel::LocalExec:
13154 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
13155 PositionIndependent);
13156 }
13157 llvm_unreachable("Unknown TLS model.");
13158 }
13159
13160 if (Subtarget.isTargetDarwin()) {
13161 // Darwin only has one model of TLS. Lower to that.
13162 unsigned char OpFlag = 0;
13163 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
13164 X86ISD::WrapperRIP : X86ISD::Wrapper;
13165
13166 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13167 // global base reg.
13168 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
13169 if (PIC32)
13170 OpFlag = X86II::MO_TLVP_PIC_BASE;
13171 else
13172 OpFlag = X86II::MO_TLVP;
13173 SDLoc DL(Op);
13174 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13175 GA->getValueType(0),
13176 GA->getOffset(), OpFlag);
13177 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
13178
13179 // With PIC32, the address is actually $g + Offset.
13180 if (PIC32)
13181 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
13182 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13183 Offset);
13184
13185 // Lowering the machine isd will make sure everything is in the right
13186 // location.
13187 SDValue Chain = DAG.getEntryNode();
13188 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13189 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
13190 SDValue Args[] = { Chain, Offset };
13191 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13192 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
13193 DAG.getIntPtrConstant(0, DL, true),
13194 Chain.getValue(1), DL);
13195
13196     // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
13197 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13198 MFI->setAdjustsStack(true);
13199
13200 // And our return value (tls address) is in the standard call return value
13201 // location.
13202 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
13203 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
13204 }
13205
13206 if (Subtarget.isTargetKnownWindowsMSVC() ||
13207 Subtarget.isTargetWindowsItanium() ||
13208 Subtarget.isTargetWindowsGNU()) {
13209     // Just use the implicit TLS architecture.
13210     // We need to generate something similar to:
13211 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13212 // ; from TEB
13213 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
13214 // mov rcx, qword [rdx+rcx*8]
13215 // mov eax, .tls$:tlsvar
13216 // [rax+rcx] contains the address
13217 // Windows 64bit: gs:0x58
13218 // Windows 32bit: fs:__tls_array
13219
13220 SDLoc dl(GA);
13221 SDValue Chain = DAG.getEntryNode();
13222
13223 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13224 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13225 // use its literal value of 0x2C.
13226 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
13227 ? Type::getInt8PtrTy(*DAG.getContext(),
13228 256)
13229 : Type::getInt32PtrTy(*DAG.getContext(),
13230 257));
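    // Note the segments are swapped relative to the ELF path above: the Windows
    // TEB is addressed via GS (address space 256) in 64-bit mode and via FS
    // (address space 257) in 32-bit mode.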
13231
13232 SDValue TlsArray = Subtarget.is64Bit()
13233 ? DAG.getIntPtrConstant(0x58, dl)
13234 : (Subtarget.isTargetWindowsGNU()
13235 ? DAG.getIntPtrConstant(0x2C, dl)
13236 : DAG.getExternalSymbol("_tls_array", PtrVT));
13237
13238 SDValue ThreadPointer =
13239 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false,
13240 false, false, 0);
13241
13242 SDValue res;
13243 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
13244 res = ThreadPointer;
13245 } else {
13246 // Load the _tls_index variable
13247 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
13248 if (Subtarget.is64Bit())
13249 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
13250 MachinePointerInfo(), MVT::i32, false, false,
13251 false, 0);
13252 else
13253 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false,
13254 false, false, 0);
13255
13256 auto &DL = DAG.getDataLayout();
13257 SDValue Scale =
13258 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
13259 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
13260
13261 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
13262 }
13263
13264 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false,
13265 false, 0);
13266
13267     // Get the offset of the start of the .tls section.
13268 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13269 GA->getValueType(0),
13270 GA->getOffset(), X86II::MO_SECREL);
13271 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
13272
13273 // The address of the thread local variable is the add of the thread
13274 // pointer with the offset of the variable.
13275 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
13276 }
13277
13278 llvm_unreachable("TLS not implemented for this target.");
13279 }
13280
13281 /// Lower SRA_PARTS and friends, which return two i32 values
13282 /// and take a 2 x i32 value to shift plus a shift amount.
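// For SHL_PARTS the in-range result is {Lo << Amt, SHLD(Hi, Lo, Amt)}; a
// compare of the shift amount against the part width then selects the correct
// pair of values via CMOV when Amt is VTBits or larger.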
13283 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13284 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13285 MVT VT = Op.getSimpleValueType();
13286 unsigned VTBits = VT.getSizeInBits();
13287 SDLoc dl(Op);
13288 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13289 SDValue ShOpLo = Op.getOperand(0);
13290 SDValue ShOpHi = Op.getOperand(1);
13291 SDValue ShAmt = Op.getOperand(2);
13292 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
13293 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
13294 // during isel.
13295 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13296 DAG.getConstant(VTBits - 1, dl, MVT::i8));
13297 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13298 DAG.getConstant(VTBits - 1, dl, MVT::i8))
13299 : DAG.getConstant(0, dl, VT);
13300
13301 SDValue Tmp2, Tmp3;
13302 if (Op.getOpcode() == ISD::SHL_PARTS) {
13303 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13304 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13305 } else {
13306 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13307 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13308 }
13309
13310   // If the shift amount is larger than or equal to the width of a part, we
13311   // can't rely on the results of shld/shrd. Insert a test and select the
13312   // appropriate values for large shift amounts.
13313 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13314 DAG.getConstant(VTBits, dl, MVT::i8));
13315 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13316 AndNode, DAG.getConstant(0, dl, MVT::i8));
13317
13318 SDValue Hi, Lo;
13319 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
13320 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13321 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13322
13323 if (Op.getOpcode() == ISD::SHL_PARTS) {
13324 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13325 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13326 } else {
13327 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13328 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13329 }
13330
13331 SDValue Ops[2] = { Lo, Hi };
13332 return DAG.getMergeValues(Ops, dl);
13333 }
13334
13335 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13336 SelectionDAG &DAG) const {
13337 SDValue Src = Op.getOperand(0);
13338 MVT SrcVT = Src.getSimpleValueType();
13339 MVT VT = Op.getSimpleValueType();
13340 SDLoc dl(Op);
13341
13342 if (SrcVT.isVector()) {
13343 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
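      // CVTDQ2PD converts the two i32 lanes in the low 64 bits of its source,
      // so widen the v2i32 input to v4i32 with undef upper lanes first.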
13344 return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT,
13345 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
13346 DAG.getUNDEF(SrcVT)));
13347 }
13348 if (SrcVT.getVectorElementType() == MVT::i1) {
13349 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13350 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13351 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
13352 }
13353 return SDValue();
13354 }
13355
13356 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13357 "Unknown SINT_TO_FP to lower!");
13358
13359 // These are really Legal; return the operand so the caller accepts it as
13360 // Legal.
13361 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13362 return Op;
13363 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13364 Subtarget.is64Bit()) {
13365 return Op;
13366 }
13367
13368 SDValue ValueToStore = Op.getOperand(0);
13369 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13370 !Subtarget.is64Bit())
13371 // Bitcasting to f64 here allows us to do a single 64-bit store from
13372 // an SSE register, avoiding the store forwarding penalty that would come
13373 // with two 32-bit stores.
13374 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
13375
13376 unsigned Size = SrcVT.getSizeInBits()/8;
13377 MachineFunction &MF = DAG.getMachineFunction();
13378 auto PtrVT = getPointerTy(MF.getDataLayout());
13379 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13380 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13381 SDValue Chain = DAG.getStore(
13382 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
13383 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false,
13384 false, 0);
13385 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
13386 }
13387
13388 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
13389 SDValue StackSlot,
13390 SelectionDAG &DAG) const {
13391 // Build the FILD
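  // FILD loads the integer from memory onto the x87 stack. When the result
  // needs to live in an SSE register (useSSE), it is stored back to a stack
  // slot with FST and reloaded, since there is no direct x87-to-XMM move.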
13392 SDLoc DL(Op);
13393 SDVTList Tys;
13394 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
13395 if (useSSE)
13396 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
13397 else
13398 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
13399
13400 unsigned ByteSize = SrcVT.getSizeInBits()/8;
13401
13402 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
13403 MachineMemOperand *MMO;
13404 if (FI) {
13405 int SSFI = FI->getIndex();
13406 MMO = DAG.getMachineFunction().getMachineMemOperand(
13407 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13408 MachineMemOperand::MOLoad, ByteSize, ByteSize);
13409 } else {
13410 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
13411 StackSlot = StackSlot.getOperand(1);
13412 }
13413 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
13414 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
13415 X86ISD::FILD, DL,
13416 Tys, Ops, SrcVT, MMO);
13417
13418 if (useSSE) {
13419 Chain = Result.getValue(1);
13420 SDValue InFlag = Result.getValue(2);
13421
13422 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
13423 // shouldn't be necessary except that RFP cannot be live across
13424 // multiple blocks. When stackifier is fixed, they can be uncoupled.
13425 MachineFunction &MF = DAG.getMachineFunction();
13426 unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
13427 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
13428 auto PtrVT = getPointerTy(MF.getDataLayout());
13429 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13430 Tys = DAG.getVTList(MVT::Other);
13431 SDValue Ops[] = {
13432 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
13433 };
13434 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
13435 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13436 MachineMemOperand::MOStore, SSFISize, SSFISize);
13437
13438 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
13439 Ops, Op.getValueType(), MMO);
13440 Result = DAG.getLoad(
13441 Op.getValueType(), DL, Chain, StackSlot,
13442 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13443 false, false, false, 0);
13444 }
13445
13446 return Result;
13447 }
13448
13449 /// 64-bit unsigned integer to double expansion.
13450 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
13451 SelectionDAG &DAG) const {
13452 // This algorithm is not obvious. Here is what we're trying to output:
13453 /*
13454 movq %rax, %xmm0
13455 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
13456 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
13457 #ifdef __SSE3__
13458 haddpd %xmm0, %xmm0
13459 #else
13460 pshufd $0x4e, %xmm0, %xmm1
13461 addpd %xmm1, %xmm0
13462 #endif
13463 */
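// For illustration, consider x = 2^32 + 3 (hi = 1, lo = 3):
//   punpckldq pairs the halves with the exponent words, giving the doubles
//     d0 = bits{0x43300000, lo} = 0x1.0p52 + 3
//     d1 = bits{0x45300000, hi} = 0x1.0p84 + 1 * 0x1.0p32
//   subpd removes the exponent biases:
//     d0 - 0x1.0p52 = 3.0,   d1 - 0x1.0p84 = 0x1.0p32
//   and the final horizontal add yields 2^32 + 3 exactly.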
13464
13465 SDLoc dl(Op);
13466 LLVMContext *Context = DAG.getContext();
13467
13468 // Build some magic constants.
13469 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
13470 Constant *C0 = ConstantDataVector::get(*Context, CV0);
13471 auto PtrVT = getPointerTy(DAG.getDataLayout());
13472 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
13473
13474 SmallVector<Constant*,2> CV1;
13475 CV1.push_back(
13476 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13477 APInt(64, 0x4330000000000000ULL))));
13478 CV1.push_back(
13479 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13480 APInt(64, 0x4530000000000000ULL))));
13481 Constant *C1 = ConstantVector::get(CV1);
13482 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
13483
13484 // Load the 64-bit value into an XMM register.
13485 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
13486 Op.getOperand(0));
13487 SDValue CLod0 =
13488 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
13489 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
13490 false, false, false, 16);
13491 SDValue Unpck1 =
13492 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
13493
13494 SDValue CLod1 =
13495 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
13496 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
13497 false, false, false, 16);
13498 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
13499 // TODO: Are there any fast-math-flags to propagate here?
13500 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
13501 SDValue Result;
13502
13503 if (Subtarget.hasSSE3()) {
13504 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
13505 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
13506 } else {
13507 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
13508 SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
13509 S2F, 0x4E, DAG);
13510 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
13511 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
13512 }
13513
13514 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
13515 DAG.getIntPtrConstant(0, dl));
13516 }
13517
13518 /// 32-bit unsigned integer to float expansion.
13519 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
13520 SelectionDAG &DAG) const {
13521 SDLoc dl(Op);
13522 // FP constant to bias correct the final result.
13523 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
13524 MVT::f64);
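// For illustration: OR'ing the zero-extended 32-bit value v into the mantissa
// of the bias 0x1.0p52 (bit pattern 0x4330000000000000) produces the double
// 2^52 + v exactly, since v < 2^32 fits in the low 52 mantissa bits. The
// final FSUB of the bias then recovers v; e.g. for v = 7,
//   bits 0x4330000000000007 == 2^52 + 7, and (2^52 + 7) - 2^52 == 7.0.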
13525
13526 // Load the 32-bit value into an XMM register.
13527 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
13528 Op.getOperand(0));
13529
13530 // Zero out the upper parts of the register.
13531 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
13532
13533 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13534 DAG.getBitcast(MVT::v2f64, Load),
13535 DAG.getIntPtrConstant(0, dl));
13536
13537 // Or the load with the bias.
13538 SDValue Or = DAG.getNode(
13539 ISD::OR, dl, MVT::v2i64,
13540 DAG.getBitcast(MVT::v2i64,
13541 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
13542 DAG.getBitcast(MVT::v2i64,
13543 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
13544 Or =
13545 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13546 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
13547
13548 // Subtract the bias.
13549 // TODO: Are there any fast-math-flags to propagate here?
13550 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
13551
13552 // Handle final rounding.
13553 MVT DestVT = Op.getSimpleValueType();
13554
13555 if (DestVT.bitsLT(MVT::f64))
13556 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
13557 DAG.getIntPtrConstant(0, dl));
13558 if (DestVT.bitsGT(MVT::f64))
13559 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
13560
13561 // Handle final rounding.
13562 return Sub;
13563 }
13564
13565 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
13566 const X86Subtarget &Subtarget) {
13567 // The algorithm is the following:
13568 // #ifdef __SSE4_1__
13569 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13570 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13571 // (uint4) 0x53000000, 0xaa);
13572 // #else
13573 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13574 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
13575 // #endif
13576 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
13577 // return (float4) lo + fhi;
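// Worked example for v = 0x12345678:
//   lo  = 0x4b005678 -> as float: 0x1.0p23 + 0x5678
//   hi  = 0x53001234 -> as float: 0x1.0p39 + 0x1234 * 0x1.0p16
//   fhi = hi - (0x1.0p39f + 0x1.0p23f) = 0x1234 * 0x1.0p16 - 0x1.0p23
//   lo + fhi = (0x1.0p23 + 0x5678) + (0x1234 * 0x1.0p16 - 0x1.0p23)
//            = 0x12345678, rounded once in the final add.
// Both halves are exact, so the only rounding is that last add, which is why
// the two FADDs must not be reassociated (see below).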
13578
13579 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
13580 // reassociate the two FADDs, and if we do that, the algorithm fails
13581 // spectacularly (PR24512).
13582 // FIXME: If we ever have some kind of Machine FMF, this should be marked
13583 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
13584 // there's also the MachineCombiner reassociations happening on Machine IR.
13585 if (DAG.getTarget().Options.UnsafeFPMath)
13586 return SDValue();
13587
13588 SDLoc DL(Op);
13589 SDValue V = Op->getOperand(0);
13590 MVT VecIntVT = V.getSimpleValueType();
13591 bool Is128 = VecIntVT == MVT::v4i32;
13592 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
13593 // If we convert to something other than the supported type, e.g., to v4f64,
13594 // abort early.
13595 if (VecFloatVT != Op->getSimpleValueType(0))
13596 return SDValue();
13597
13598 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
13599 "Unsupported custom type");
13600
13601 // In the #ifdef/#else code, we have in common:
13602 // - The vector of constants:
13603 // -- 0x4b000000
13604 // -- 0x53000000
13605 // - A shift:
13606 // -- v >> 16
13607
13608 // Create the splat vector for 0x4b000000.
13609 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
13610 // Create the splat vector for 0x53000000.
13611 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
13612
13613 // Create the right shift.
13614 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
13615 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
13616
13617 SDValue Low, High;
13618 if (Subtarget.hasSSE41()) {
13619 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
13620 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13621 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
13622 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
13623 // Low will be bitcasted right away, so do not bother bitcasting back to its
13624 // original type.
13625 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
13626 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
13627 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13628 // (uint4) 0x53000000, 0xaa);
13629 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
13630 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
13631 // High will be bitcasted right away, so do not bother bitcasting back to
13632 // its original type.
13633 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
13634 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
13635 } else {
13636 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
13637 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13638 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
13639 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
13640
13641 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
13642 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
13643 }
13644
13645 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
13646 SDValue VecCstFAdd = DAG.getConstantFP(
13647 APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, VecFloatVT);
13648
13649 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
13650 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
13651 // TODO: Are there any fast-math-flags to propagate here?
13652 SDValue FHigh =
13653 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
13654 // return (float4) lo + fhi;
13655 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
13656 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
13657 }
13658
13659 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
13660 SelectionDAG &DAG) const {
13661 SDValue N0 = Op.getOperand(0);
13662 MVT SVT = N0.getSimpleValueType();
13663 SDLoc dl(Op);
13664
13665 switch (SVT.SimpleTy) {
13666 default:
13667 llvm_unreachable("Custom UINT_TO_FP is not supported!");
13668 case MVT::v4i8:
13669 case MVT::v4i16:
13670 case MVT::v8i8:
13671 case MVT::v8i16: {
13672 MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
13673 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13674 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
13675 }
13676 case MVT::v4i32:
13677 case MVT::v8i32:
13678 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
13679 case MVT::v16i8:
13680 case MVT::v16i16:
13681 assert(Subtarget.hasAVX512());
13682 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
13683 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
13684 }
13685 }
13686
13687 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
13688 SelectionDAG &DAG) const {
13689 SDValue N0 = Op.getOperand(0);
13690 SDLoc dl(Op);
13691 auto PtrVT = getPointerTy(DAG.getDataLayout());
13692
13693 if (Op.getSimpleValueType().isVector())
13694 return lowerUINT_TO_FP_vec(Op, DAG);
13695
13696 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
13697 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
13698 // the optimization here.
13699 if (DAG.SignBitIsZero(N0))
13700 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
13701
13702 MVT SrcVT = N0.getSimpleValueType();
13703 MVT DstVT = Op.getSimpleValueType();
13704
13705 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
13706 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
13707 // Conversions from unsigned i32 to f32/f64 are legal,
13708 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
13709 return Op;
13710 }
13711
13712 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
13713 return LowerUINT_TO_FP_i64(Op, DAG);
13714 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
13715 return LowerUINT_TO_FP_i32(Op, DAG);
13716 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
13717 return SDValue();
13718
13719 // Make a 64-bit buffer, and use it to build an FILD.
13720 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
13721 if (SrcVT == MVT::i32) {
13722 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
13723 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13724 StackSlot, MachinePointerInfo(),
13725 false, false, 0);
13726 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
13727 OffsetSlot, MachinePointerInfo(),
13728 false, false, 0);
13729 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
13730 return Fild;
13731 }
13732
13733 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
13734 SDValue ValueToStore = Op.getOperand(0);
13735 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
13736 // Bitcasting to f64 here allows us to do a single 64-bit store from
13737 // an SSE register, avoiding the store forwarding penalty that would come
13738 // with two 32-bit stores.
13739 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
13740 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore,
13741 StackSlot, MachinePointerInfo(),
13742 false, false, 0);
13743 // For i64 source, we need to add the appropriate power of 2 if the input
13744 // was negative. This is the same as the optimization in
13745 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
13746 // we must be careful to do the computation in x87 extended precision, not
13747 // in SSE. (The generic code can't know it's OK to do this, or how to.)
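// For illustration: FILD reads the stored bits as a *signed* i64, so an
// unsigned input x >= 2^63 comes back as x - 2^64. The fudge constant
// 0x5F800000 below is the f32 encoding of 0x1.0p64; adding it when the sign
// bit was set restores x, e.g. x = 2^63 is loaded as -2^63, and
// -2^63 + 2^64 = 2^63.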
13748 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
13749 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
13750 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13751 MachineMemOperand::MOLoad, 8, 8);
13752
13753 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
13754 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
13755 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
13756 MVT::i64, MMO);
13757
13758 APInt FF(32, 0x5F800000ULL);
13759
13760 // Check whether the sign bit is set.
13761 SDValue SignSet = DAG.getSetCC(
13762 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
13763 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
13764
13765 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
13766 SDValue FudgePtr = DAG.getConstantPool(
13767 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
13768
13769 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
13770 SDValue Zero = DAG.getIntPtrConstant(0, dl);
13771 SDValue Four = DAG.getIntPtrConstant(4, dl);
13772 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
13773 Zero, Four);
13774 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
13775
13776 // Load the value out, extending it from f32 to f80.
13777 // FIXME: Avoid the extend by constructing the right constant pool?
13778 SDValue Fudge = DAG.getExtLoad(
13779 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
13780 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
13781 false, false, false, 4);
13782 // Extend everything to 80 bits to force it to be done on x87.
13783 // TODO: Are there any fast-math-flags to propagate here?
13784 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
13785 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
13786 DAG.getIntPtrConstant(0, dl));
13787 }
13788
13789 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
13790 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
13791 // just return an <SDValue(), SDValue()> pair.
13792 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
13793 // to i16, i32 or i64, and we lower it to a legal sequence.
13794 // If lowered to the final integer result we return a <result, SDValue()> pair.
13795 // Otherwise we lower it to a sequence ending with a FIST and return a
13796 // <FIST, StackSlot> pair, and the caller is responsible for loading
13797 // the final integer result from StackSlot.
13798 std::pair<SDValue,SDValue>
13799 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
13800 bool IsSigned, bool IsReplace) const {
13801 SDLoc DL(Op);
13802
13803 EVT DstTy = Op.getValueType();
13804 EVT TheVT = Op.getOperand(0).getValueType();
13805 auto PtrVT = getPointerTy(DAG.getDataLayout());
13806
13807 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
13808 // f16 must be promoted before using the lowering in this routine.
13809 // fp128 does not use this lowering.
13810 return std::make_pair(SDValue(), SDValue());
13811 }
13812
13813 // If using FIST to compute an unsigned i64, we'll need some fixup
13814 // to handle values above the maximum signed i64. A FIST is always
13815 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
13816 bool UnsignedFixup = !IsSigned &&
13817 DstTy == MVT::i64 &&
13818 (!Subtarget.is64Bit() ||
13819 !isScalarFPTypeInSSEReg(TheVT));
13820
13821 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
13822 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
13823 // The low 32 bits of the fist result will have the correct uint32 result.
13824 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
13825 DstTy = MVT::i64;
13826 }
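// For example, an input of 3.0e9 exceeds INT32_MAX but fits in uint32; the
// signed-i64 FIST produces 3000000000, whose low 32 bits are already the
// desired uint32 value.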
13827
13828 assert(DstTy.getSimpleVT() <= MVT::i64 &&
13829 DstTy.getSimpleVT() >= MVT::i16 &&
13830 "Unknown FP_TO_INT to lower!");
13831
13832 // These are really Legal.
13833 if (DstTy == MVT::i32 &&
13834 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
13835 return std::make_pair(SDValue(), SDValue());
13836 if (Subtarget.is64Bit() &&
13837 DstTy == MVT::i64 &&
13838 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
13839 return std::make_pair(SDValue(), SDValue());
13840
13841 // We lower FP->int64 into FISTP64 followed by a load from a temporary
13842 // stack slot.
13843 MachineFunction &MF = DAG.getMachineFunction();
13844 unsigned MemSize = DstTy.getSizeInBits()/8;
13845 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
13846 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13847
13848 unsigned Opc;
13849 switch (DstTy.getSimpleVT().SimpleTy) {
13850 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
13851 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
13852 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
13853 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
13854 }
13855
13856 SDValue Chain = DAG.getEntryNode();
13857 SDValue Value = Op.getOperand(0);
13858 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
13859
13860 if (UnsignedFixup) {
13861 //
13862 // Conversion to unsigned i64 is implemented with a select,
13863 // depending on whether the source value fits in the range
13864 // of a signed i64. Let Thresh be the FP equivalent of
13865 // 0x8000000000000000ULL.
13866 //
13867 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
13868 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
13869 // Fist-to-mem64 FistSrc
13870 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
13871 // to XOR'ing the high 32 bits with Adjust.
13872 //
13873 // Being a power of 2, Thresh is exactly representable in all FP formats.
13874 // For X87 we'd like to use the smallest FP type for this constant, but
13875 // for DAG type consistency we have to match the FP operand type.
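// Worked example: for Value = 0x1.8p63 (= 2^63 + 2^62, exactly representable),
//   Adjust  = 0x80000000            (Value >= Thresh)
//   FistSrc = Value - Thresh = 2^62
//   FIST stores 0x4000000000000000; XOR'ing the high 32 bits with Adjust
//   gives 0xC000000000000000 = 2^63 + 2^62, the desired unsigned result.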
13876
13877 APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000));
13878 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
13879 bool LosesInfo = false;
13880 if (TheVT == MVT::f64)
13881 // The rounding mode is irrelevant as the conversion should be exact.
13882 Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
13883 &LosesInfo);
13884 else if (TheVT == MVT::f80)
13885 Status = Thresh.convert(APFloat::x87DoubleExtended,
13886 APFloat::rmNearestTiesToEven, &LosesInfo);
13887
13888 assert(Status == APFloat::opOK && !LosesInfo &&
13889 "FP conversion should have been exact");
13890
13891 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
13892
13893 SDValue Cmp = DAG.getSetCC(DL,
13894 getSetCCResultType(DAG.getDataLayout(),
13895 *DAG.getContext(), TheVT),
13896 Value, ThreshVal, ISD::SETLT);
13897 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
13898 DAG.getConstant(0, DL, MVT::i32),
13899 DAG.getConstant(0x80000000, DL, MVT::i32));
13900 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
13901 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
13902 *DAG.getContext(), TheVT),
13903 Value, ThreshVal, ISD::SETLT);
13904 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
13905 }
13906
13907 // FIXME This causes a redundant load/store if the SSE-class value is already
13908 // in memory, such as if it is on the callstack.
13909 if (isScalarFPTypeInSSEReg(TheVT)) {
13910 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
13911 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
13912 MachinePointerInfo::getFixedStack(MF, SSFI), false,
13913 false, 0);
13914 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
13915 SDValue Ops[] = {
13916 Chain, StackSlot, DAG.getValueType(TheVT)
13917 };
13918
13919 MachineMemOperand *MMO =
13920 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
13921 MachineMemOperand::MOLoad, MemSize, MemSize);
13922 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
13923 Chain = Value.getValue(1);
13924 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
13925 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13926 }
13927
13928 MachineMemOperand *MMO =
13929 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
13930 MachineMemOperand::MOStore, MemSize, MemSize);
13931
13932 if (UnsignedFixup) {
13933
13934 // Insert the FIST, load its result as two i32's,
13935 // and XOR the high i32 with Adjust.
13936
13937 SDValue FistOps[] = { Chain, Value, StackSlot };
13938 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
13939 FistOps, DstTy, MMO);
13940
13941 SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot,
13942 MachinePointerInfo(),
13943 false, false, false, 0);
13944 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
13945
13946 SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr,
13947 MachinePointerInfo(),
13948 false, false, false, 0);
13949 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
13950
13951 if (Subtarget.is64Bit()) {
13952 // Join High32 and Low32 into a 64-bit result.
13953 // (High32 << 32) | Low32
13954 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
13955 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
13956 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
13957 DAG.getConstant(32, DL, MVT::i8));
13958 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
13959 return std::make_pair(Result, SDValue());
13960 }
13961
13962 SDValue ResultOps[] = { Low32, High32 };
13963
13964 SDValue pair = IsReplace
13965 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
13966 : DAG.getMergeValues(ResultOps, DL);
13967 return std::make_pair(pair, SDValue());
13968 } else {
13969 // Build the FP_TO_INT*_IN_MEM
13970 SDValue Ops[] = { Chain, Value, StackSlot };
13971 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
13972 Ops, DstTy, MMO);
13973 return std::make_pair(FIST, StackSlot);
13974 }
13975 }
13976
13977 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
13978 const X86Subtarget &Subtarget) {
13979 MVT VT = Op->getSimpleValueType(0);
13980 SDValue In = Op->getOperand(0);
13981 MVT InVT = In.getSimpleValueType();
13982 SDLoc dl(Op);
13983
13984 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
13985 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
13986
13987 // Optimize vectors in AVX mode:
13988 //
13989 // v8i16 -> v8i32
13990 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
13991 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
13992 // Concat upper and lower parts.
13993 //
13994 // v4i32 -> v4i64
13995 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
13996 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
13997 // Concat upper and lower parts.
13998 //
13999
14000 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14001 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14002 ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14003 return SDValue();
14004
14005 if (Subtarget.hasInt256())
14006 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14007
14008 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14009 SDValue Undef = DAG.getUNDEF(InVT);
14010 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14011 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14012 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14013
14014 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14015 VT.getVectorNumElements()/2);
14016
14017 OpLo = DAG.getBitcast(HVT, OpLo);
14018 OpHi = DAG.getBitcast(HVT, OpHi);
14019
14020 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14021 }
14022
14023 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14024 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14025 MVT VT = Op->getSimpleValueType(0);
14026 SDValue In = Op->getOperand(0);
14027 MVT InVT = In.getSimpleValueType();
14028 SDLoc DL(Op);
14029 unsigned int NumElts = VT.getVectorNumElements();
14030 if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
14031 return SDValue();
14032
14033 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14034 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14035
14036 assert(InVT.getVectorElementType() == MVT::i1);
14037
14038 // Extend VT to 512 bits if it is a 256- or 128-bit vector and VLX is not supported.
14039 MVT ExtVT = VT;
14040 if (!VT.is512BitVector() && !Subtarget.hasVLX())
14041 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
14042
14043 SDValue One =
14044 DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
14045 SDValue Zero =
14046 DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
14047
14048 SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
14049 if (VT == ExtVT)
14050 return SelectedVal;
14051 return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
14052 }
14053
14054 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
14055 SelectionDAG &DAG) {
14056 if (Subtarget.hasFp256())
14057 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
14058 return Res;
14059
14060 return SDValue();
14061 }
14062
14063 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
14064 SelectionDAG &DAG) {
14065 SDLoc DL(Op);
14066 MVT VT = Op.getSimpleValueType();
14067 SDValue In = Op.getOperand(0);
14068 MVT SVT = In.getSimpleValueType();
14069
14070 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14071 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
14072
14073 if (Subtarget.hasFp256())
14074 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
14075 return Res;
14076
14077 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14078 VT.getVectorNumElements() != SVT.getVectorNumElements());
14079 return SDValue();
14080 }
14081
14082 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
14083 const X86Subtarget &Subtarget) {
14084
14085 SDLoc DL(Op);
14086 MVT VT = Op.getSimpleValueType();
14087 SDValue In = Op.getOperand(0);
14088 MVT InVT = In.getSimpleValueType();
14089
14090 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
14091
14092 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
14093 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
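// For illustration: truncation to i1 keeps only bit 0 of each lane, so e.g.
// an i8 lane 0b00000011 shifted left by 7 becomes 0b10000000. VPMOVB2M/W2M
// read that MSB directly into a mask register; the TESTM path below instead
// tests each shifted lane for being non-zero, which is equivalent.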
14094 if (InVT.getScalarSizeInBits() <= 16) {
14095 if (Subtarget.hasBWI()) {
14096 // legal, will go to VPMOVB2M, VPMOVW2M
14097 // Shifting packed bytes is not supported natively, so bitcast to words.
14098 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
14099 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
14100 DAG.getBitcast(ExtVT, In),
14101 DAG.getConstant(ShiftInx, DL, ExtVT));
14102 ShiftNode = DAG.getBitcast(InVT, ShiftNode);
14103 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
14104 }
14105 // Use TESTD/Q, extended vector to packed dword/qword.
14106 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
14107 "Unexpected vector type.");
14108 unsigned NumElts = InVT.getVectorNumElements();
14109 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
14110 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14111 InVT = ExtVT;
14112 ShiftInx = InVT.getScalarSizeInBits() - 1;
14113 }
14114
14115 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
14116 DAG.getConstant(ShiftInx, DL, InVT));
14117 return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
14118 }
14119
14120 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14121 SDLoc DL(Op);
14122 MVT VT = Op.getSimpleValueType();
14123 SDValue In = Op.getOperand(0);
14124 MVT InVT = In.getSimpleValueType();
14125
14126 if (VT == MVT::i1) {
14127 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14128 "Invalid scalar TRUNCATE operation");
14129 if (InVT.getSizeInBits() >= 32)
14130 return SDValue();
14131 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14132 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14133 }
14134 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14135 "Invalid TRUNCATE operation");
14136
14137 if (VT.getVectorElementType() == MVT::i1)
14138 return LowerTruncateVecI1(Op, DAG, Subtarget);
14139
14140 // vpmovqb/w/d, vpmovdb/w, vpmovwb
14141 if (Subtarget.hasAVX512()) {
14142 // word to byte only under BWI
14143 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
14144 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
14145 DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
14146 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14147 }
14148 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14149 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
14150 if (Subtarget.hasInt256()) {
14151 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
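// Viewed as v8i32 (little endian), dwords 0,2,4,6 are the low halves of the
// four qwords, so this shuffle performs the truncation directly; e.g. the
// qword 0x1111111122222222 contributes its 0x22222222 half.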
14152 In = DAG.getBitcast(MVT::v8i32, In);
14153 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14154 ShufMask);
14155 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14156 DAG.getIntPtrConstant(0, DL));
14157 }
14158
14159 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14160 DAG.getIntPtrConstant(0, DL));
14161 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14162 DAG.getIntPtrConstant(2, DL));
14163 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
14164 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
14165 static const int ShufMask[] = {0, 2, 4, 6};
14166 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14167 }
14168
14169 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14170 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
14171 if (Subtarget.hasInt256()) {
14172 In = DAG.getBitcast(MVT::v32i8, In);
14173
14174 SmallVector<SDValue,32> pshufbMask;
14175 for (unsigned i = 0; i < 2; ++i) {
14176 pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
14177 pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
14178 pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
14179 pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
14180 pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
14181 pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
14182 pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
14183 pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
14184 for (unsigned j = 0; j < 8; ++j)
14185 pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
14186 }
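// Within each 128-bit lane, this mask gathers the low two bytes of each dword
// into the lane's first 8 bytes and zeroes the rest (index 0x80 selects zero).
// The v4i64 shuffle below then packs the two lanes' low qwords together, so
// the low 128 bits hold the eight truncated i16 elements.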
14187 SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
14188 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14189 In = DAG.getBitcast(MVT::v4i64, In);
14190
14191 static const int ShufMask[] = {0, 2, -1, -1};
14192 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
14193 ShufMask);
14194 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14195 DAG.getIntPtrConstant(0, DL));
14196 return DAG.getBitcast(VT, In);
14197 }
14198
14199 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14200 DAG.getIntPtrConstant(0, DL));
14201
14202 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14203 DAG.getIntPtrConstant(4, DL));
14204
14205 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
14206 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
14207
14208 // The PSHUFB mask:
14209 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
14210 -1, -1, -1, -1, -1, -1, -1, -1};
14211
14212 SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14213 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14214 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14215
14216 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
14217 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
14218
14219 // The MOVLHPS Mask:
14220 static const int ShufMask2[] = {0, 1, 4, 5};
14221 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14222 return DAG.getBitcast(MVT::v8i16, res);
14223 }
14224
14225 // Handle truncation of V256 to V128 using shuffles.
14226 if (!VT.is128BitVector() || !InVT.is256BitVector())
14227 return SDValue();
14228
14229 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
14230
14231 unsigned NumElems = VT.getVectorNumElements();
14232 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14233
14234 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14235 // Prepare truncation shuffle mask
14236 for (unsigned i = 0; i != NumElems; ++i)
14237 MaskVec[i] = i * 2;
14238 SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
14239 DAG.getUNDEF(NVT), MaskVec);
14240 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14241 DAG.getIntPtrConstant(0, DL));
14242 }
14243
14244 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14245 SelectionDAG &DAG) const {
14246 assert(!Op.getSimpleValueType().isVector());
14247
14248 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14249 /*IsSigned=*/ true, /*IsReplace=*/ false);
14250 SDValue FIST = Vals.first, StackSlot = Vals.second;
14251 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14252 if (!FIST.getNode())
14253 return Op;
14254
14255 if (StackSlot.getNode())
14256 // Load the result.
14257 return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14258 FIST, StackSlot, MachinePointerInfo(),
14259 false, false, false, 0);
14260
14261 // The node is the result.
14262 return FIST;
14263 }
14264
14265 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14266 SelectionDAG &DAG) const {
14267 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14268 /*IsSigned=*/ false, /*IsReplace=*/ false);
14269 SDValue FIST = Vals.first, StackSlot = Vals.second;
14270 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14271 if (!FIST.getNode())
14272 return Op;
14273
14274 if (StackSlot.getNode())
14275 // Load the result.
14276 return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14277 FIST, StackSlot, MachinePointerInfo(),
14278 false, false, false, 0);
14279
14280 // The node is the result.
14281 return FIST;
14282 }
14283
14284 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14285 SDLoc DL(Op);
14286 MVT VT = Op.getSimpleValueType();
14287 SDValue In = Op.getOperand(0);
14288 MVT SVT = In.getSimpleValueType();
14289
14290 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14291
14292 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14293 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14294 In, DAG.getUNDEF(SVT)));
14295 }
14296
14297 /// The only differences between FABS and FNEG are the mask and the logic op.
14298 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
14299 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14300 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14301 "Wrong opcode for lowering FABS or FNEG.");
14302
14303 bool IsFABS = (Op.getOpcode() == ISD::FABS);
14304
14305 // If this is a FABS and it has an FNEG user, bail out to fold the combination
14306 // into an FNABS. We'll lower the FABS after that if it is still in use.
14307 if (IsFABS)
14308 for (SDNode *User : Op->uses())
14309 if (User->getOpcode() == ISD::FNEG)
14310 return Op;
14311
14312 SDLoc dl(Op);
14313 MVT VT = Op.getSimpleValueType();
14314
14315 bool IsF128 = (VT == MVT::f128);
14316
14317 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14318 // decide if we should generate a 16-byte constant mask when we only need 4 or
14319 // 8 bytes for the scalar case.
14320
14321 MVT LogicVT;
14322 MVT EltVT;
14323 unsigned NumElts;
14324
14325 if (VT.isVector()) {
14326 LogicVT = VT;
14327 EltVT = VT.getVectorElementType();
14328 NumElts = VT.getVectorNumElements();
14329 } else if (IsF128) {
14330 // SSE instructions are used for optimized f128 logical operations.
14331 LogicVT = MVT::f128;
14332 EltVT = VT;
14333 NumElts = 1;
14334 } else {
14335 // There are no scalar bitwise logical SSE/AVX instructions, so we
14336 // generate a 16-byte vector constant and logic op even for the scalar case.
14337 // Using a 16-byte mask allows folding the load of the mask with
14338 // the logic op, so it can save (~4 bytes) on code size.
14339 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
14340 EltVT = VT;
14341 NumElts = (VT == MVT::f64) ? 2 : 4;
14342 }
14343
14344 unsigned EltBits = EltVT.getSizeInBits();
14345 LLVMContext *Context = DAG.getContext();
14346 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14347 APInt MaskElt =
14348 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
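// For example, with f32 elements the FABS mask is 0x7fffffff (cleared sign
// bit, used with FAND) and the FNEG mask is 0x80000000 (sign bit only, used
// with FXOR); the FNABS case below uses FOR with the sign-bit mask to force
// the sign bit on.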
14349 Constant *C = ConstantInt::get(*Context, MaskElt);
14350 C = ConstantVector::getSplat(NumElts, C);
14351 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14352 SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
14353 unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14354 SDValue Mask =
14355 DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
14356 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14357 false, false, false, Alignment);
14358
14359 SDValue Op0 = Op.getOperand(0);
14360 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14361 unsigned LogicOp =
14362 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14363 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14364
14365 if (VT.isVector() || IsF128)
14366 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
14367
14368 // For the scalar case extend to a 128-bit vector, perform the logic op,
14369 // and extract the scalar result back out.
14370 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
14371 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
14372 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
14373 DAG.getIntPtrConstant(0, dl));
14374 }
14375
14376 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
14377 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14378 LLVMContext *Context = DAG.getContext();
14379 SDValue Op0 = Op.getOperand(0);
14380 SDValue Op1 = Op.getOperand(1);
14381 SDLoc dl(Op);
14382 MVT VT = Op.getSimpleValueType();
14383 MVT SrcVT = Op1.getSimpleValueType();
14384 bool IsF128 = (VT == MVT::f128);
14385
14386 // If second operand is smaller, extend it first.
14387 if (SrcVT.bitsLT(VT)) {
14388 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14389 SrcVT = VT;
14390 }
14391 // And if it is bigger, shrink it first.
14392 if (SrcVT.bitsGT(VT)) {
14393 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl));
14394 SrcVT = VT;
14395 }
14396
14397 // At this point the operands and the result should have the same
14398 // type, and that won't be f80 since that is not custom lowered.
14399 assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) &&
14400 "Unexpected type in LowerFCOPYSIGN");
14401
14402 const fltSemantics &Sem =
14403 VT == MVT::f64 ? APFloat::IEEEdouble :
14404 (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
14405 const unsigned SizeInBits = VT.getSizeInBits();
14406
14407 SmallVector<Constant *, 4> CV(
14408 VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4),
14409 ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14410
14411 // First, clear all bits but the sign bit from the second operand (sign).
14412 CV[0] = ConstantFP::get(*Context,
14413 APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
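// Sketch of the overall sequence for f64 (illustrative):
//   SignBit = Op1 & 0x8000000000000000   // keep only the sign of Op1
//   Val     = Op0 & 0x7fffffffffffffff   // clear the sign of Op0
//   result  = Val | SignBit              // copysign(Op0, Op1)
// performed as 128-bit FAND/FOR so the constants can be load-folded.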
14414 Constant *C = ConstantVector::get(CV);
14415 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
14416 SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
14417
14418 // Perform all logic operations as 16-byte vectors because there are no
14419 // scalar FP logic instructions in SSE. This allows load folding of the
14420 // constants into the logic instructions.
14421 MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32);
14422 SDValue Mask1 =
14423 DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
14424 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14425 false, false, false, 16);
14426 if (!IsF128)
14427 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
14428 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
14429
14430 // Next, clear the sign bit from the first operand (magnitude).
14431 // If it's a constant, we can clear it here.
14432 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14433 APFloat APF = Op0CN->getValueAPF();
14434 // If the magnitude is a positive zero, the sign bit alone is enough.
14435 if (APF.isPosZero())
14436 return IsF128 ? SignBit :
14437 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
14438 DAG.getIntPtrConstant(0, dl));
14439 APF.clearSign();
14440 CV[0] = ConstantFP::get(*Context, APF);
14441 } else {
14442 CV[0] = ConstantFP::get(
14443 *Context,
14444 APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14445 }
14446 C = ConstantVector::get(CV);
14447 CPIdx = DAG.getConstantPool(C, PtrVT, 16);
14448 SDValue Val =
14449 DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
14450 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14451 false, false, false, 16);
14452 // If the magnitude operand wasn't a constant, we need to AND out the sign.
14453 if (!isa<ConstantFPSDNode>(Op0)) {
14454 if (!IsF128)
14455 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
14456 Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
14457 }
14458 // OR the magnitude value with the sign bit.
14459 Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
14460 return IsF128 ? Val :
14461 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
14462 DAG.getIntPtrConstant(0, dl));
14463 }
14464
14465 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14466 SDValue N0 = Op.getOperand(0);
14467 SDLoc dl(Op);
14468 MVT VT = Op.getSimpleValueType();
14469
14470 MVT OpVT = N0.getSimpleValueType();
14471 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
14472 "Unexpected type for FGETSIGN");
14473
14474 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
14475 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
14476 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
14477 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
14478 Res = DAG.getZExtOrTrunc(Res, dl, VT);
14479 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
14480 return Res;
14481 }
14482
14483 // Check whether an OR'd tree is PTEST-able.
14484 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
14485 SelectionDAG &DAG) {
14486 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14487
14488 if (!Subtarget.hasSSE41())
14489 return SDValue();
14490
14491 if (!Op->hasOneUse())
14492 return SDValue();
14493
14494 SDNode *N = Op.getNode();
14495 SDLoc DL(N);
14496
14497 SmallVector<SDValue, 8> Opnds;
14498 DenseMap<SDValue, unsigned> VecInMap;
14499 SmallVector<SDValue, 8> VecIns;
14500 EVT VT = MVT::Other;
14501
14502 // Recognize a special case where a vector is cast into a wide integer to
14503 // test all 0s.
14504 Opnds.push_back(N->getOperand(0));
14505 Opnds.push_back(N->getOperand(1));
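// The pattern being matched looks like (illustratively):
//   ((extractelt v, 0) | (extractelt v, 1) | ...) == 0
// where every element of a single 128/256-bit vector (or of several vectors
// of the same type) is OR'd in; it can then be emitted as a PTEST of the
// OR'd vectors against themselves, which sets ZF iff the value is all zeros.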
14506
14507 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14508 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14509 // BFS traverse all OR'd operands.
14510 if (I->getOpcode() == ISD::OR) {
14511 Opnds.push_back(I->getOperand(0));
14512 Opnds.push_back(I->getOperand(1));
14513 // Re-evaluate the number of nodes to be traversed.
14514 e += 2; // 2 more nodes (LHS and RHS) are pushed.
14515 continue;
14516 }
14517
14518 // Quit if this is not an EXTRACT_VECTOR_ELT.
14519 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14520 return SDValue();
14521
14522 // Quit if the extract does not have a constant index.
14523 SDValue Idx = I->getOperand(1);
14524 if (!isa<ConstantSDNode>(Idx))
14525 return SDValue();
14526
14527 SDValue ExtractedFromVec = I->getOperand(0);
14528 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14529 if (M == VecInMap.end()) {
14530 VT = ExtractedFromVec.getValueType();
14531 // Quit if not 128/256-bit vector.
14532 if (!VT.is128BitVector() && !VT.is256BitVector())
14533 return SDValue();
14534 // Quit if not the same type.
14535 if (VecInMap.begin() != VecInMap.end() &&
14536 VT != VecInMap.begin()->first.getValueType())
14537 return SDValue();
14538 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14539 VecIns.push_back(ExtractedFromVec);
14540 }
14541 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14542 }
14543
14544 assert((VT.is128BitVector() || VT.is256BitVector()) &&
14545 "Not extracted from 128-/256-bit vector.");
14546
14547 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
14548
14549 for (DenseMap<SDValue, unsigned>::const_iterator
14550 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
14551 // Quit if not all elements are used.
14552 if (I->second != FullMask)
14553 return SDValue();
14554 }
14555
14556 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
14557
14558 // Cast all vectors into TestVT for PTEST.
14559 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
14560 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
14561
14562 // If more than one full vector is evaluated, OR them together before the PTEST.
14563 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
14564 // Each iteration will OR 2 nodes and append the result until there is only
14565 // 1 node left, i.e. the final OR'd value of all vectors.
14566 SDValue LHS = VecIns[Slot];
14567 SDValue RHS = VecIns[Slot + 1];
14568 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
14569 }
14570
14571 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
14572 VecIns.back(), VecIns.back());
14573 }
14574
14575 /// \brief Return true if \c Op has a use that doesn't just read flags.
14576 static bool hasNonFlagsUse(SDValue Op) {
14577 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
14578 ++UI) {
14579 SDNode *User = *UI;
14580 unsigned UOpNo = UI.getOperandNo();
14581 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
14582 // Look past the truncate.
14583 UOpNo = User->use_begin().getOperandNo();
14584 User = *User->use_begin();
14585 }
14586
14587 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
14588 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
14589 return true;
14590 }
14591 return false;
14592 }
14593
14594 // Emit KTEST instruction for bit vectors on AVX-512
14595 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
14596 const X86Subtarget &Subtarget) {
14597 if (Op.getOpcode() == ISD::BITCAST) {
14598 auto hasKTEST = [&](MVT VT) {
14599 unsigned SizeInBits = VT.getSizeInBits();
14600 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
14601 (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
14602 };
14603 SDValue Op0 = Op.getOperand(0);
14604 MVT Op0VT = Op0.getValueType().getSimpleVT();
14605 if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
14606 hasKTEST(Op0VT))
14607 return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
14608 }
14609 return SDValue();
14610 }
14611
14612 /// Emit nodes that will be selected as "test Op0,Op0", or something
14613 /// equivalent.
14614 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
14615 SelectionDAG &DAG) const {
14616 if (Op.getValueType() == MVT::i1) {
14617 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
14618 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
14619 DAG.getConstant(0, dl, MVT::i8));
14620 }
14621 // CF and OF aren't always set the way we want. Determine which
14622 // of these we need.
14623 bool NeedCF = false;
14624 bool NeedOF = false;
14625 switch (X86CC) {
14626 default: break;
14627 case X86::COND_A: case X86::COND_AE:
14628 case X86::COND_B: case X86::COND_BE:
14629 NeedCF = true;
14630 break;
14631 case X86::COND_G: case X86::COND_GE:
14632 case X86::COND_L: case X86::COND_LE:
14633 case X86::COND_O: case X86::COND_NO: {
14634 // Check if we really need to set the
14635 // Overflow flag. If NoSignedWrap is present
14636 // that is not actually needed.
14637 switch (Op->getOpcode()) {
14638 case ISD::ADD:
14639 case ISD::SUB:
14640 case ISD::MUL:
14641 case ISD::SHL: {
14642 const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
14643 if (BinNode->Flags.hasNoSignedWrap())
14644 break;
14645 }
14646 default:
14647 NeedOF = true;
14648 break;
14649 }
14650 break;
14651 }
14652 }
14653 // See if we can use the EFLAGS value from the operand instead of
14654 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
14655 // we prove that the arithmetic won't overflow, we can't use OF or CF.
14656 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
14657 // Emit KTEST for bit vectors
14658 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
14659 return Node;
14660 // Emit a CMP with 0, which is the TEST pattern.
14661 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14662 DAG.getConstant(0, dl, Op.getValueType()));
14663 }
14664 unsigned Opcode = 0;
14665 unsigned NumOperands = 0;
14666
14667 // Truncate operations may prevent the merge of the SETCC instruction
14668 // and the arithmetic instruction before it. Attempt to truncate the operands
14669 // of the arithmetic instruction and use a reduced bit-width instruction.
14670 bool NeedTruncation = false;
14671 SDValue ArithOp = Op;
14672 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
14673 SDValue Arith = Op->getOperand(0);
14674 // Both the trunc and the arithmetic op need to have one user each.
14675 if (Arith->hasOneUse())
14676 switch (Arith.getOpcode()) {
14677 default: break;
14678 case ISD::ADD:
14679 case ISD::SUB:
14680 case ISD::AND:
14681 case ISD::OR:
14682 case ISD::XOR: {
14683 NeedTruncation = true;
14684 ArithOp = Arith;
14685 }
14686 }
14687 }
14688
14689 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
14690 // which may be the result of a CAST. We use the variable 'Op', which is the
14691 // non-casted variable when we check for possible users.
14692 switch (ArithOp.getOpcode()) {
14693 case ISD::ADD:
14694 // Due to an isel shortcoming, be conservative if this add is likely to be
14695 // selected as part of a load-modify-store instruction. When the root node
14696 // in a match is a store, isel doesn't know how to remap non-chain non-flag
14697 // uses of other nodes in the match, such as the ADD in this case. This
14698 // leads to the ADD being left around and reselected, with the result being
14699 // two adds in the output. Alas, even if none of our users are stores, that
14700 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
14701 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
14702 // climbing the DAG back to the root, and it doesn't seem to be worth the
14703 // effort.
14704 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14705 UE = Op.getNode()->use_end(); UI != UE; ++UI)
14706 if (UI->getOpcode() != ISD::CopyToReg &&
14707 UI->getOpcode() != ISD::SETCC &&
14708 UI->getOpcode() != ISD::STORE)
14709 goto default_case;
14710
14711 if (ConstantSDNode *C =
14712 dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
14713 // An add of one will be selected as an INC.
14714 if (C->isOne() && !Subtarget.slowIncDec()) {
14715 Opcode = X86ISD::INC;
14716 NumOperands = 1;
14717 break;
14718 }
14719
14720 // An add of negative one (subtract of one) will be selected as a DEC.
14721 if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
14722 Opcode = X86ISD::DEC;
14723 NumOperands = 1;
14724 break;
14725 }
14726 }
14727
14728 // Otherwise use a regular EFLAGS-setting add.
14729 Opcode = X86ISD::ADD;
14730 NumOperands = 2;
14731 break;
14732 case ISD::SHL:
14733 case ISD::SRL:
14734 // If we have a constant logical shift that's only used in a comparison
14735 // against zero turn it into an equivalent AND. This allows turning it into
14736 // a TEST instruction later.
14737 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
14738 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
14739 EVT VT = Op.getValueType();
14740 unsigned BitWidth = VT.getSizeInBits();
14741 unsigned ShAmt = Op->getConstantOperandVal(1);
14742 if (ShAmt >= BitWidth) // Avoid undefined shifts.
14743 break;
14744 APInt Mask = ArithOp.getOpcode() == ISD::SRL
14745 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
14746 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
14747 if (!Mask.isSignedIntN(32)) // Avoid large immediates.
14748 break;
14749 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
14750 DAG.getConstant(Mask, dl, VT));
14751 }
14752 break;
14753
14754 case ISD::AND:
14755 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
14756 // because a TEST instruction will be better.
14757 if (!hasNonFlagsUse(Op)) {
14758 SDValue Op0 = ArithOp->getOperand(0);
14759 SDValue Op1 = ArithOp->getOperand(1);
14760 EVT VT = ArithOp.getValueType();
14761 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
14762 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
14763
14764 // But if we can combine this into an ANDN operation, then create an AND
14765 // now and allow it to be pattern matched into an ANDN.
14766 if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
14767 break;
14768 }
14769 // FALL THROUGH
14770 case ISD::SUB:
14771 case ISD::OR:
14772 case ISD::XOR:
14773 // Due to the ISEL shortcoming noted above, be conservative if this op is
14774 // likely to be selected as part of a load-modify-store instruction.
14775 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14776 UE = Op.getNode()->use_end(); UI != UE; ++UI)
14777 if (UI->getOpcode() == ISD::STORE)
14778 goto default_case;
14779
14780 // Otherwise use a regular EFLAGS-setting instruction.
14781 switch (ArithOp.getOpcode()) {
14782 default: llvm_unreachable("unexpected operator!");
14783 case ISD::SUB: Opcode = X86ISD::SUB; break;
14784 case ISD::XOR: Opcode = X86ISD::XOR; break;
14785 case ISD::AND: Opcode = X86ISD::AND; break;
14786 case ISD::OR: {
14787 if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
14788 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
14789 return EFLAGS;
14790 }
14791 Opcode = X86ISD::OR;
14792 break;
14793 }
14794 }
14795
14796 NumOperands = 2;
14797 break;
14798 case X86ISD::ADD:
14799 case X86ISD::SUB:
14800 case X86ISD::INC:
14801 case X86ISD::DEC:
14802 case X86ISD::OR:
14803 case X86ISD::XOR:
14804 case X86ISD::AND:
14805 return SDValue(Op.getNode(), 1);
14806 default:
14807 default_case:
14808 break;
14809 }
14810
14811 // If we found that truncation is beneficial, perform the truncation and
14812 // update 'Op'.
14813 if (NeedTruncation) {
14814 EVT VT = Op.getValueType();
14815 SDValue WideVal = Op->getOperand(0);
14816 EVT WideVT = WideVal.getValueType();
14817 unsigned ConvertedOp = 0;
14818 // Use a target machine opcode to prevent further DAGCombine
14819 // optimizations that may separate the arithmetic operations
14820 // from the setcc node.
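    // For example (illustrative only): (i8 (trunc (i32 add x, y))) compared
    // against zero becomes an i8 X86ISD::ADD of the truncated operands, so the
    // flags are produced directly by the narrow operation.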
14821 switch (WideVal.getOpcode()) {
14822 default: break;
14823 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
14824 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
14825 case ISD::AND: ConvertedOp = X86ISD::AND; break;
14826 case ISD::OR: ConvertedOp = X86ISD::OR; break;
14827 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
14828 }
14829
14830 if (ConvertedOp) {
14831 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14832 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
14833 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
14834 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
14835 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
14836 }
14837 }
14838 }
14839
14840 if (Opcode == 0) {
14841 // Emit KTEST for bit vectors
14842 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
14843 return Node;
14844
14845 // Emit a CMP with 0, which is the TEST pattern.
14846 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14847 DAG.getConstant(0, dl, Op.getValueType()));
14848 }
14849 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
14850 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
14851
14852 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
14853 DAG.ReplaceAllUsesWith(Op, New);
14854 return SDValue(New.getNode(), 1);
14855 }
14856
14857 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
14858 /// equivalent.
14859 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
14860 const SDLoc &dl, SelectionDAG &DAG) const {
14861 if (isNullConstant(Op1))
14862 return EmitTest(Op0, X86CC, dl, DAG);
14863
14864 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
14865 "Unexpected comparison operation for MVT::i1 operands");
14866
14867 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
14868 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
14869     // Only promote the compare up to i32 if it is a 16-bit operation
14870     // with an immediate; 16-bit immediates are to be avoided.
14871 if ((Op0.getValueType() == MVT::i16 &&
14872 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
14873 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
14874 !Subtarget.isAtom()) {
14875 unsigned ExtendOp =
14876 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
14877 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
14878 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
14879 }
14880 // Use SUB instead of CMP to enable CSE between SUB and CMP.
14881 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
14882 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
14883 Op0, Op1);
14884 return SDValue(Sub.getNode(), 1);
14885 }
14886 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
14887 }
14888
14889 /// Convert a comparison if required by the subtarget.
14890 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
14891 SelectionDAG &DAG) const {
14892 // If the subtarget does not support the FUCOMI instruction, floating-point
14893 // comparisons have to be converted.
14894 if (Subtarget.hasCMov() ||
14895 Cmp.getOpcode() != X86ISD::CMP ||
14896 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
14897 !Cmp.getOperand(1).getValueType().isFloatingPoint())
14898 return Cmp;
14899
14900 // The instruction selector will select an FUCOM instruction instead of
14901 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
14902 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
14903 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
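  // Roughly, this matches the classic x87 sequence (illustrative only):
  //   fucom st(1) ; fnstsw ax ; sahf
  // after which the usual integer condition codes can be tested.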
14904 SDLoc dl(Cmp);
14905 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
14906 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
14907 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
14908 DAG.getConstant(8, dl, MVT::i8));
14909 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
14910
14911 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
14912 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
14913 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
14914 }
14915
14916 /// The minimum architected relative accuracy is 2^-12. We need one
14917 /// Newton-Raphson step to have a good float result (24 bits of precision).
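/// One Newton-Raphson iteration refines an estimate E of 1/sqrt(A) as
///   E' = E * (1.5 - 0.5 * A * E * E)
/// (standard formula, shown here only for reference).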
14918 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
14919 DAGCombinerInfo &DCI,
14920 unsigned &RefinementSteps,
14921 bool &UseOneConstNR) const {
14922 EVT VT = Op.getValueType();
14923 const char *RecipOp;
14924
14925 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
14926 // TODO: Add support for AVX512 (v16f32).
14927 // It is likely not profitable to do this for f64 because a double-precision
14928 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
14929 // instructions: convert to single, rsqrtss, convert back to double, refine
14930 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
14931 // along with FMA, this could be a throughput win.
14932 if (VT == MVT::f32 && Subtarget.hasSSE1())
14933 RecipOp = "sqrtf";
14934 else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
14935 (VT == MVT::v8f32 && Subtarget.hasAVX()))
14936 RecipOp = "vec-sqrtf";
14937 else
14938 return SDValue();
14939
14940 TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
14941 if (!Recips.isEnabled(RecipOp))
14942 return SDValue();
14943
14944 RefinementSteps = Recips.getRefinementSteps(RecipOp);
14945 UseOneConstNR = false;
14946 return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
14947 }
14948
14949 /// The minimum architected relative accuracy is 2^-12. We need one
14950 /// Newton-Raphson step to have a good float result (24 bits of precision).
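/// One Newton-Raphson iteration refines an estimate E of 1/A as
///   E' = E * (2.0 - A * E)
/// (standard formula, shown here only for reference).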
14951 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
14952 DAGCombinerInfo &DCI,
14953 unsigned &RefinementSteps) const {
14954 EVT VT = Op.getValueType();
14955 const char *RecipOp;
14956
14957 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
14958 // TODO: Add support for AVX512 (v16f32).
14959 // It is likely not profitable to do this for f64 because a double-precision
14960 // reciprocal estimate with refinement on x86 prior to FMA requires
14961 // 15 instructions: convert to single, rcpss, convert back to double, refine
14962 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
14963 // along with FMA, this could be a throughput win.
14964 if (VT == MVT::f32 && Subtarget.hasSSE1())
14965 RecipOp = "divf";
14966 else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
14967 (VT == MVT::v8f32 && Subtarget.hasAVX()))
14968 RecipOp = "vec-divf";
14969 else
14970 return SDValue();
14971
14972 TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
14973 if (!Recips.isEnabled(RecipOp))
14974 return SDValue();
14975
14976 RefinementSteps = Recips.getRefinementSteps(RecipOp);
14977 return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
14978 }
14979
14980 /// If we have at least two divisions that use the same divisor, convert to
14981 /// multiplication by a reciprocal. This may need to be adjusted for a given
14982 /// CPU if a division's cost is not at least twice the cost of a multiplication.
14983 /// This is because we still need one division to calculate the reciprocal and
14984 /// then we need two multiplies by that reciprocal as replacements for the
14985 /// original divisions.
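/// For example (illustrative): given x/b and y/b, the combine computes
/// r = 1/b once and rewrites the divisions as x*r and y*r.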
14986 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
14987 return 2;
14988 }
14989
14990 /// Result of 'and' is compared against zero. Change to a BT node if possible.
14991 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
14992 const SDLoc &dl, SelectionDAG &DAG) const {
14993 SDValue Op0 = And.getOperand(0);
14994 SDValue Op1 = And.getOperand(1);
14995 if (Op0.getOpcode() == ISD::TRUNCATE)
14996 Op0 = Op0.getOperand(0);
14997 if (Op1.getOpcode() == ISD::TRUNCATE)
14998 Op1 = Op1.getOperand(0);
14999
15000 SDValue LHS, RHS;
15001 if (Op1.getOpcode() == ISD::SHL)
15002 std::swap(Op0, Op1);
15003 if (Op0.getOpcode() == ISD::SHL) {
15004 if (isOneConstant(Op0.getOperand(0))) {
15005 // If we looked past a truncate, check that it's only truncating away
15006 // known zeros.
15007 unsigned BitWidth = Op0.getValueSizeInBits();
15008 unsigned AndBitWidth = And.getValueSizeInBits();
15009 if (BitWidth > AndBitWidth) {
15010 APInt Zeros, Ones;
15011 DAG.computeKnownBits(Op0, Zeros, Ones);
15012 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15013 return SDValue();
15014 }
15015 LHS = Op1;
15016 RHS = Op0.getOperand(1);
15017 }
15018 } else if (Op1.getOpcode() == ISD::Constant) {
15019 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15020 uint64_t AndRHSVal = AndRHS->getZExtValue();
15021 SDValue AndLHS = Op0;
15022
15023 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15024 LHS = AndLHS.getOperand(0);
15025 RHS = AndLHS.getOperand(1);
15026 }
15027
15028 // Use BT if the immediate can't be encoded in a TEST instruction.
15029 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15030 LHS = AndLHS;
15031 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
15032 }
15033 }
15034
15035 if (LHS.getNode()) {
15036 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
15037 // instruction. Since the shift amount is in-range-or-undefined, we know
15038 // that doing a bittest on the i32 value is ok. We extend to i32 because
15039 // the encoding for the i16 version is larger than the i32 version.
15040     // Also promote i16 to i32 for performance / code size reasons.
15041 if (LHS.getValueType() == MVT::i8 ||
15042 LHS.getValueType() == MVT::i16)
15043 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15044
15045 // If the operand types disagree, extend the shift amount to match. Since
15046 // BT ignores high bits (like shifts) we can use anyextend.
15047 if (LHS.getValueType() != RHS.getValueType())
15048 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15049
15050 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15051 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15052 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15053 DAG.getConstant(Cond, dl, MVT::i8), BT);
15054 }
15055
15056 return SDValue();
15057 }
15058
15059 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
15060 /// CMPs.
15061 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15062 SDValue &Op1) {
15063 unsigned SSECC;
15064 bool Swap = false;
15065
15066 // SSE Condition code mapping:
15067 // 0 - EQ
15068 // 1 - LT
15069 // 2 - LE
15070 // 3 - UNORD
15071 // 4 - NEQ
15072 // 5 - NLT
15073 // 6 - NLE
15074 // 7 - ORD
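  // For example, ISD::SETOLT maps directly to predicate 1 (LT), while
  // ISD::SETOGT is handled by swapping the operands and reusing LT. A return
  // value of 8 signals that no single predicate exists (SETUEQ/SETONE).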
15075 switch (SetCCOpcode) {
15076 default: llvm_unreachable("Unexpected SETCC condition");
15077 case ISD::SETOEQ:
15078 case ISD::SETEQ: SSECC = 0; break;
15079 case ISD::SETOGT:
15080 case ISD::SETGT: Swap = true; // Fallthrough
15081 case ISD::SETLT:
15082 case ISD::SETOLT: SSECC = 1; break;
15083 case ISD::SETOGE:
15084 case ISD::SETGE: Swap = true; // Fallthrough
15085 case ISD::SETLE:
15086 case ISD::SETOLE: SSECC = 2; break;
15087 case ISD::SETUO: SSECC = 3; break;
15088 case ISD::SETUNE:
15089 case ISD::SETNE: SSECC = 4; break;
15090 case ISD::SETULE: Swap = true; // Fallthrough
15091 case ISD::SETUGE: SSECC = 5; break;
15092 case ISD::SETULT: Swap = true; // Fallthrough
15093 case ISD::SETUGT: SSECC = 6; break;
15094 case ISD::SETO: SSECC = 7; break;
15095 case ISD::SETUEQ:
15096 case ISD::SETONE: SSECC = 8; break;
15097 }
15098 if (Swap)
15099 std::swap(Op0, Op1);
15100
15101 return SSECC;
15102 }
15103
15104 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
15105 /// concatenate the result back.
15106 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15107 MVT VT = Op.getSimpleValueType();
15108
15109 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15110 "Unsupported value type for operation");
15111
15112 unsigned NumElems = VT.getVectorNumElements();
15113 SDLoc dl(Op);
15114 SDValue CC = Op.getOperand(2);
15115
15116 // Extract the LHS vectors
15117 SDValue LHS = Op.getOperand(0);
15118 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
15119 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
15120
15121 // Extract the RHS vectors
15122 SDValue RHS = Op.getOperand(1);
15123 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
15124 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
15125
15126 // Issue the operation on the smaller types and concatenate the result back
15127 MVT EltVT = VT.getVectorElementType();
15128 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15129 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15130 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15131 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15132 }
15133
15134 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
15135 SDValue Op0 = Op.getOperand(0);
15136 SDValue Op1 = Op.getOperand(1);
15137 SDValue CC = Op.getOperand(2);
15138 MVT VT = Op.getSimpleValueType();
15139 SDLoc dl(Op);
15140
15141 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
15142 "Unexpected type for boolean compare operation");
15143 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15144 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
15145 DAG.getConstant(-1, dl, VT));
15146 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
15147 DAG.getConstant(-1, dl, VT));
15148 switch (SetCCOpcode) {
15149 default: llvm_unreachable("Unexpected SETCC condition");
15150 case ISD::SETEQ:
15151 // (x == y) -> ~(x ^ y)
15152 return DAG.getNode(ISD::XOR, dl, VT,
15153 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
15154 DAG.getConstant(-1, dl, VT));
15155 case ISD::SETNE:
15156 // (x != y) -> (x ^ y)
15157 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
15158 case ISD::SETUGT:
15159 case ISD::SETGT:
15160 // (x > y) -> (x & ~y)
15161 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
15162 case ISD::SETULT:
15163 case ISD::SETLT:
15164 // (x < y) -> (~x & y)
15165 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
15166 case ISD::SETULE:
15167 case ISD::SETLE:
15168 // (x <= y) -> (~x | y)
15169 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
15170 case ISD::SETUGE:
15171 case ISD::SETGE:
15172     // (x >= y) -> (x | ~y)
15173 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
15174 }
15175 }
15176
15177 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
15178
15179 SDValue Op0 = Op.getOperand(0);
15180 SDValue Op1 = Op.getOperand(1);
15181 SDValue CC = Op.getOperand(2);
15182 MVT VT = Op.getSimpleValueType();
15183 SDLoc dl(Op);
15184
15185 assert(VT.getVectorElementType() == MVT::i1 &&
15186 "Cannot set masked compare for this operation");
15187
15188 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15189 unsigned Opc = 0;
15190 bool Unsigned = false;
15191 bool Swap = false;
15192 unsigned SSECC;
15193 switch (SetCCOpcode) {
15194 default: llvm_unreachable("Unexpected SETCC condition");
15195 case ISD::SETNE: SSECC = 4; break;
15196 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
15197 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15198 case ISD::SETLT: Swap = true; //fall-through
15199 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
15200 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15201 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15202 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
15203 case ISD::SETULE: Unsigned = true; //fall-through
15204 case ISD::SETLE: SSECC = 2; break;
15205 }
15206
15207 if (Swap)
15208 std::swap(Op0, Op1);
15209 if (Opc)
15210 return DAG.getNode(Opc, dl, VT, Op0, Op1);
15211 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15212 return DAG.getNode(Opc, dl, VT, Op0, Op1,
15213 DAG.getConstant(SSECC, dl, MVT::i8));
15214 }
15215
15216 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15217 /// operand \p Op1. If non-trivial (for example because it's not constant)
15218 /// return an empty value.
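/// For example (illustrative): 'x u< <4,4,4,4>' becomes 'x u<= <3,3,3,3>',
/// which the PSUBUS-based lowering below can use without swapping operands.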
15219 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
15220 SelectionDAG &DAG) {
15221 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15222 if (!BV)
15223 return SDValue();
15224
15225 MVT VT = Op1.getSimpleValueType();
15226 MVT EVT = VT.getVectorElementType();
15227 unsigned n = VT.getVectorNumElements();
15228 SmallVector<SDValue, 8> ULTOp1;
15229
15230 for (unsigned i = 0; i < n; ++i) {
15231 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15232 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
15233 return SDValue();
15234
15235 // Avoid underflow.
15236 APInt Val = Elt->getAPIntValue();
15237 if (Val == 0)
15238 return SDValue();
15239
15240 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
15241 }
15242
15243 return DAG.getBuildVector(VT, dl, ULTOp1);
15244 }
15245
15246 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
15247 SelectionDAG &DAG) {
15248 SDValue Op0 = Op.getOperand(0);
15249 SDValue Op1 = Op.getOperand(1);
15250 SDValue CC = Op.getOperand(2);
15251 MVT VT = Op.getSimpleValueType();
15252 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15253 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
15254 SDLoc dl(Op);
15255
15256 if (isFP) {
15257 #ifndef NDEBUG
15258 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
15259 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
15260 #endif
15261
15262 unsigned Opc;
15263 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
15264 assert(VT.getVectorNumElements() <= 16);
15265 Opc = X86ISD::CMPM;
15266 } else {
15267 Opc = X86ISD::CMPP;
15268 // The SSE/AVX packed FP comparison nodes are defined with a
15269 // floating-point vector result that matches the operand type. This allows
15270 // them to work with an SSE1 target (integer vector types are not legal).
15271 VT = Op0.getSimpleValueType();
15272 }
15273
15274 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
15275 // emit two comparisons and a logic op to tie them together.
15276 // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
15277 // available.
15278 SDValue Cmp;
15279 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
15280 if (SSECC == 8) {
15281 // LLVM predicate is SETUEQ or SETONE.
15282 unsigned CC0, CC1;
15283 unsigned CombineOpc;
15284 if (SetCCOpcode == ISD::SETUEQ) {
15285 CC0 = 3; // UNORD
15286 CC1 = 0; // EQ
15287 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
15288 static_cast<unsigned>(ISD::OR);
15289 } else {
15290 assert(SetCCOpcode == ISD::SETONE);
15291 CC0 = 7; // ORD
15292 CC1 = 4; // NEQ
15293 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
15294 static_cast<unsigned>(ISD::AND);
15295 }
15296
15297 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15298 DAG.getConstant(CC0, dl, MVT::i8));
15299 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15300 DAG.getConstant(CC1, dl, MVT::i8));
15301 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
15302 } else {
15303 // Handle all other FP comparisons here.
15304 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
15305 DAG.getConstant(SSECC, dl, MVT::i8));
15306 }
15307
15308 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
15309 // result type of SETCC. The bitcast is expected to be optimized away
15310 // during combining/isel.
15311 if (Opc == X86ISD::CMPP)
15312 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
15313
15314 return Cmp;
15315 }
15316
15317 MVT VTOp0 = Op0.getSimpleValueType();
15318 assert(VTOp0 == Op1.getSimpleValueType() &&
15319 "Expected operands with same type!");
15320 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
15321 "Invalid number of packed elements for source and destination!");
15322
15323 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
15324 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
15325 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
15326     // legalizer first checks whether the first operand of the setcc has
15327 // a legal type. If so, then it promotes the return type to that same type.
15328 // Otherwise, the return type is promoted to the 'next legal type' which,
15329 // for a vector of MVT::i1 is always a 128-bit integer vector type.
15330 //
15331 // We reach this code only if the following two conditions are met:
15332 // 1. Both return type and operand type have been promoted to wider types
15333 // by the type legalizer.
15334 // 2. The original operand type has been promoted to a 256-bit vector.
15335 //
15336 // Note that condition 2. only applies for AVX targets.
15337 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
15338 return DAG.getZExtOrTrunc(NewOp, dl, VT);
15339 }
15340
15341 // The non-AVX512 code below works under the assumption that source and
15342 // destination types are the same.
15343 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
15344 "Value types for source and destination must be the same!");
15345
15346 // Break 256-bit integer vector compare into smaller ones.
15347 if (VT.is256BitVector() && !Subtarget.hasInt256())
15348 return Lower256IntVSETCC(Op, DAG);
15349
15350 // Operands are boolean (vectors of i1)
15351 MVT OpVT = Op1.getSimpleValueType();
15352 if (OpVT.getVectorElementType() == MVT::i1)
15353 return LowerBoolVSETCC_AVX512(Op, DAG);
15354
15355 // The result is boolean, but operands are int/float
15356 if (VT.getVectorElementType() == MVT::i1) {
15357     // In the AVX-512 architecture, setcc returns a mask with i1 elements,
15358     // but there is no compare instruction for i8 and i16 elements in KNL.
15359     // In this case, use the SSE compare and truncate the result.
15360 bool UseAVX512Inst =
15361 (OpVT.is512BitVector() ||
15362 OpVT.getVectorElementType().getSizeInBits() >= 32 ||
15363 (Subtarget.hasBWI() && Subtarget.hasVLX()));
15364
15365 if (UseAVX512Inst)
15366 return LowerIntVSETCC_AVX512(Op, DAG);
15367
15368 return DAG.getNode(ISD::TRUNCATE, dl, VT,
15369 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
15370 }
15371
15372 // Lower using XOP integer comparisons.
15373 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
15374 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
15375 // Translate compare code to XOP PCOM compare mode.
15376 unsigned CmpMode = 0;
15377 switch (SetCCOpcode) {
15378 default: llvm_unreachable("Unexpected SETCC condition");
15379 case ISD::SETULT:
15380 case ISD::SETLT: CmpMode = 0x00; break;
15381 case ISD::SETULE:
15382 case ISD::SETLE: CmpMode = 0x01; break;
15383 case ISD::SETUGT:
15384 case ISD::SETGT: CmpMode = 0x02; break;
15385 case ISD::SETUGE:
15386 case ISD::SETGE: CmpMode = 0x03; break;
15387 case ISD::SETEQ: CmpMode = 0x04; break;
15388 case ISD::SETNE: CmpMode = 0x05; break;
15389 }
15390
15391 // Are we comparing unsigned or signed integers?
15392 unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
15393 ? X86ISD::VPCOMU : X86ISD::VPCOM;
15394
15395 return DAG.getNode(Opc, dl, VT, Op0, Op1,
15396 DAG.getConstant(CmpMode, dl, MVT::i8));
15397 }
15398
15399 // We are handling one of the integer comparisons here. Since SSE only has
15400 // GT and EQ comparisons for integer, swapping operands and multiple
15401 // operations may be required for some comparisons.
15402 unsigned Opc;
15403 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
15404 bool Subus = false;
15405
15406 switch (SetCCOpcode) {
15407 default: llvm_unreachable("Unexpected SETCC condition");
15408 case ISD::SETNE: Invert = true;
15409 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
15410 case ISD::SETLT: Swap = true;
15411 case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
15412 case ISD::SETGE: Swap = true;
15413 case ISD::SETLE: Opc = X86ISD::PCMPGT;
15414 Invert = true; break;
15415 case ISD::SETULT: Swap = true;
15416 case ISD::SETUGT: Opc = X86ISD::PCMPGT;
15417 FlipSigns = true; break;
15418 case ISD::SETUGE: Swap = true;
15419 case ISD::SETULE: Opc = X86ISD::PCMPGT;
15420 FlipSigns = true; Invert = true; break;
15421 }
15422
15423 // Special case: Use min/max operations for SETULE/SETUGE
15424 MVT VET = VT.getVectorElementType();
15425 bool hasMinMax =
15426 (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
15427 || (Subtarget.hasSSE2() && (VET == MVT::i8));
15428
15429 if (hasMinMax) {
15430 switch (SetCCOpcode) {
15431 default: break;
15432 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
15433 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
15434 }
15435
15436 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
15437 }
15438
15439 bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
15440 if (!MinMax && hasSubus) {
15441 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
15442 // Op0 u<= Op1:
15443 // t = psubus Op0, Op1
15444 // pcmpeq t, <0..0>
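    // That is, Op0 u<= Op1 exactly when the saturating subtraction underflows
    // to zero (illustrative: 3 u<= 7 -> psubus yields 0; 9 u<= 7 -> yields 2).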
15445 switch (SetCCOpcode) {
15446 default: break;
15447 case ISD::SETULT: {
15448 // If the comparison is against a constant we can turn this into a
15449 // setule. With psubus, setule does not require a swap. This is
15450 // beneficial because the constant in the register is no longer
15451       // clobbered as the destination operand, so it can be hoisted out of a loop.
15452       // Only do this pre-AVX, since the VEX-encoded vpcmp* forms are non-destructive.
15453 if (Subtarget.hasAVX())
15454 break;
15455 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
15456 Op1 = ULEOp1;
15457 Subus = true; Invert = false; Swap = false;
15458 }
15459 break;
15460 }
15461 // Psubus is better than flip-sign because it requires no inversion.
15462 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
15463 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
15464 }
15465
15466 if (Subus) {
15467 Opc = X86ISD::SUBUS;
15468 FlipSigns = false;
15469 }
15470 }
15471
15472 if (Swap)
15473 std::swap(Op0, Op1);
15474
15475 // Check that the operation in question is available (most are plain SSE2,
15476 // but PCMPGTQ and PCMPEQQ have different requirements).
15477 if (VT == MVT::v2i64) {
15478 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
15479 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
15480
15481 // First cast everything to the right type.
15482 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
15483 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
15484
15485 // Since SSE has no unsigned integer comparisons, we need to flip the sign
15486 // bits of the inputs before performing those operations. The lower
15487 // compare is always unsigned.
15488 SDValue SB;
15489 if (FlipSigns) {
15490 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
15491 } else {
15492 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
15493 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
15494 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
15495 }
15496 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
15497 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
15498
15499 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
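      // That is, the high halves decide the result unless they are equal, in
      // which case the unsigned compare of the low halves decides. The XOR
      // above flips sign bits so the signed 32-bit PCMPGT acts as the unsigned
      // compare required for the low halves (and for the high halves when the
      // original 64-bit compare is unsigned).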
15500 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
15501 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
15502
15503 // Create masks for only the low parts/high parts of the 64 bit integers.
15504 static const int MaskHi[] = { 1, 1, 3, 3 };
15505 static const int MaskLo[] = { 0, 0, 2, 2 };
15506 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
15507 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
15508 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
15509
15510 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
15511 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
15512
15513 if (Invert)
15514 Result = DAG.getNOT(dl, Result, MVT::v4i32);
15515
15516 return DAG.getBitcast(VT, Result);
15517 }
15518
15519 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
15520 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
15521 // pcmpeqd + pshufd + pand.
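      // Each 64-bit lane is equal iff both of its 32-bit halves are equal, so
      // AND the PCMPEQD result with a copy of itself with the halves swapped.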
15522 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
15523
15524 // First cast everything to the right type.
15525 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
15526 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
15527
15528 // Do the compare.
15529 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
15530
15531 // Make sure the lower and upper halves are both all-ones.
15532 static const int Mask[] = { 1, 0, 3, 2 };
15533 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
15534 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
15535
15536 if (Invert)
15537 Result = DAG.getNOT(dl, Result, MVT::v4i32);
15538
15539 return DAG.getBitcast(VT, Result);
15540 }
15541 }
15542
15543 // Since SSE has no unsigned integer comparisons, we need to flip the sign
15544 // bits of the inputs before performing those operations.
15545 if (FlipSigns) {
15546 MVT EltVT = VT.getVectorElementType();
15547 SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
15548 VT);
15549 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
15550 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
15551 }
15552
15553 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
15554
15555 // If the logical-not of the result is required, perform that now.
15556 if (Invert)
15557 Result = DAG.getNOT(dl, Result, VT);
15558
15559 if (MinMax)
15560 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
15561
15562 if (Subus)
15563 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
15564 getZeroVector(VT, Subtarget, DAG, dl));
15565
15566 return Result;
15567 }
15568
15569 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
15570
15571 MVT VT = Op.getSimpleValueType();
15572
15573 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15574
15575 assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15576 && "SetCC type must be 8-bit or 1-bit integer");
15577 SDValue Op0 = Op.getOperand(0);
15578 SDValue Op1 = Op.getOperand(1);
15579 SDLoc dl(Op);
15580 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15581
15582 // Optimize to BT if possible.
15583 // Lower (X & (1 << N)) == 0 to BT(X, N).
15584 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15585 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
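  // For example (illustrative): (x & (1 << 5)) == 0 becomes 'bt x, 5' followed
  // by a setcc of COND_AE (carry clear); see LowerToBT above.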
15586 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15587 isNullConstant(Op1) &&
15588 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15589 if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
15590 if (VT == MVT::i1) {
15591 NewSetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, NewSetCC,
15592 DAG.getValueType(MVT::i1));
15593 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15594 }
15595 return NewSetCC;
15596 }
15597 }
15598
15599 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
15600 // these.
15601 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
15602 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15603
15604 // If the input is a setcc, then reuse the input setcc or use a new one with
15605 // the inverted condition.
15606 if (Op0.getOpcode() == X86ISD::SETCC) {
15607 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
15608 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
15609 if (!Invert)
15610 return Op0;
15611
15612 CCode = X86::GetOppositeBranchCondition(CCode);
15613 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15614 DAG.getConstant(CCode, dl, MVT::i8),
15615 Op0.getOperand(1));
15616 if (VT == MVT::i1) {
15617 SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
15618 DAG.getValueType(MVT::i1));
15619 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15620 }
15621 return SetCC;
15622 }
15623 }
15624 if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15625 if (isOneConstant(Op1)) {
15626 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15627 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
15628 }
15629 if (!isNullConstant(Op1)) {
15630 SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
15631 return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
15632 }
15633 }
15634
15635 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15636 unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG);
15637 if (X86CC == X86::COND_INVALID)
15638 return SDValue();
15639
15640 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15641 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15642 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15643 DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS);
15644 if (VT == MVT::i1) {
15645 SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
15646 DAG.getValueType(MVT::i1));
15647 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15648 }
15649 return SetCC;
15650 }
15651
15652 SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
15653 SDValue LHS = Op.getOperand(0);
15654 SDValue RHS = Op.getOperand(1);
15655 SDValue Carry = Op.getOperand(2);
15656 SDValue Cond = Op.getOperand(3);
15657 SDLoc DL(Op);
15658
15659 assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
15660 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
15661
15662 assert(Carry.getOpcode() != ISD::CARRY_FALSE);
15663 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
15664 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
15665 SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
15666 DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
15667 if (Op.getSimpleValueType() == MVT::i1) {
15668 SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
15669 DAG.getValueType(MVT::i1));
15670 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
15671 }
15672 return SetCC;
15673 }
15674
15675 /// Return true if opcode is a X86 logical comparison.
15676 static bool isX86LogicalCmp(SDValue Op) {
15677 unsigned Opc = Op.getNode()->getOpcode();
15678 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15679 Opc == X86ISD::SAHF)
15680 return true;
15681 if (Op.getResNo() == 1 &&
15682 (Opc == X86ISD::ADD ||
15683 Opc == X86ISD::SUB ||
15684 Opc == X86ISD::ADC ||
15685 Opc == X86ISD::SBB ||
15686 Opc == X86ISD::SMUL ||
15687 Opc == X86ISD::UMUL ||
15688 Opc == X86ISD::INC ||
15689 Opc == X86ISD::DEC ||
15690 Opc == X86ISD::OR ||
15691 Opc == X86ISD::XOR ||
15692 Opc == X86ISD::AND))
15693 return true;
15694
15695 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15696 return true;
15697
15698 return false;
15699 }
15700
15701 /// Returns the "condition" node, that may be wrapped with "truncate".
15702 /// Like this: (i1 (trunc (i8 X86ISD::SETCC))).
15703 static SDValue getCondAfterTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15704 if (V.getOpcode() != ISD::TRUNCATE)
15705 return V;
15706
15707 SDValue VOp0 = V.getOperand(0);
15708 if (VOp0.getOpcode() == ISD::AssertZext &&
15709 V.getValueSizeInBits() ==
15710 cast<VTSDNode>(VOp0.getOperand(1))->getVT().getSizeInBits())
15711 return VOp0.getOperand(0);
15712
15713 unsigned InBits = VOp0.getValueSizeInBits();
15714 unsigned Bits = V.getValueSizeInBits();
15715 if (DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)))
15716 return V.getOperand(0);
15717 return V;
15718 }
15719
15720 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
15721 bool addTest = true;
15722 SDValue Cond = Op.getOperand(0);
15723 SDValue Op1 = Op.getOperand(1);
15724 SDValue Op2 = Op.getOperand(2);
15725 SDLoc DL(Op);
15726 MVT VT = Op1.getSimpleValueType();
15727 SDValue CC;
15728
15729 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15730 // are available or VBLENDV if AVX is available.
15731 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
15732 if (Cond.getOpcode() == ISD::SETCC &&
15733 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15734 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
15735 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
15736 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15737 int SSECC = translateX86FSETCC(
15738 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15739
15740 if (SSECC != 8) {
15741 if (Subtarget.hasAVX512()) {
15742 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15743 DAG.getConstant(SSECC, DL, MVT::i8));
15744 return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15745 }
15746
15747 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
15748 DAG.getConstant(SSECC, DL, MVT::i8));
15749
15750 // If we have AVX, we can use a variable vector select (VBLENDV) instead
15751 // of 3 logic instructions for size savings and potentially speed.
15752 // Unfortunately, there is no scalar form of VBLENDV.
15753
15754 // If either operand is a constant, don't try this. We can expect to
15755 // optimize away at least one of the logic instructions later in that
15756 // case, so that sequence would be faster than a variable blend.
15757
15758 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
15759 // uses XMM0 as the selection register. That may need just as many
15760 // instructions as the AND/ANDN/OR sequence due to register moves, so
15761 // don't bother.
15762
15763 if (Subtarget.hasAVX() &&
15764 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
15765
15766 // Convert to vectors, do a VSELECT, and convert back to scalar.
15767 // All of the conversions should be optimized away.
15768
15769 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
15770 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
15771 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
15772 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
15773
15774 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
15775 VCmp = DAG.getBitcast(VCmpVT, VCmp);
15776
15777 SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
15778
15779 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
15780 VSel, DAG.getIntPtrConstant(0, DL));
15781 }
15782 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
15783 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
15784 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
15785 }
15786 }
15787
15788 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
15789 SDValue Op1Scalar;
15790 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
15791 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
15792 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
15793 Op1Scalar = Op1.getOperand(0);
15794 SDValue Op2Scalar;
15795 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
15796 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
15797 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
15798 Op2Scalar = Op2.getOperand(0);
15799 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
15800 SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
15801 Op1Scalar.getValueType(),
15802 Cond, Op1Scalar, Op2Scalar);
15803 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
15804 return DAG.getBitcast(VT, newSelect);
15805 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
15806 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
15807 DAG.getIntPtrConstant(0, DL));
15808 }
15809 }
15810
15811 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
15812 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
15813 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
15814 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
15815 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
15816 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
15817 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
15818 Cond, Op1, Op2);
15819 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
15820 }
15821
15822 if (Cond.getOpcode() == ISD::SETCC) {
15823 if (SDValue NewCond = LowerSETCC(Cond, DAG))
15824 Cond = NewCond;
15825 }
15826
15827 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
15828 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
15829 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
15830 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
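  // For example (illustrative): for (select (x == 0), -1, y), the CMP against 1
  // below sets the carry flag exactly when x == 0, SETCC_CARRY broadcasts it to
  // all-ones, and the result is OR'ed with y.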
15831 if (Cond.getOpcode() == X86ISD::SETCC &&
15832 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
15833 isNullConstant(Cond.getOperand(1).getOperand(1))) {
15834 SDValue Cmp = Cond.getOperand(1);
15835
15836 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
15837
15838 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
15839 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
15840 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
15841
15842 SDValue CmpOp0 = Cmp.getOperand(0);
15843 // Apply further optimizations for special cases
15844 // (select (x != 0), -1, 0) -> neg & sbb
15845 // (select (x == 0), 0, -1) -> neg & sbb
15846 if (isNullConstant(Y) &&
15847 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
15848 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
15849 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
15850 DAG.getConstant(0, DL,
15851 CmpOp0.getValueType()),
15852 CmpOp0);
15853 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15854 DAG.getConstant(X86::COND_B, DL, MVT::i8),
15855 SDValue(Neg.getNode(), 1));
15856 return Res;
15857 }
15858
15859 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
15860 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
15861 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
15862
15863 SDValue Res = // Res = 0 or -1.
15864 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15865 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
15866
15867 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
15868 Res = DAG.getNOT(DL, Res, Res.getValueType());
15869
15870 if (!isNullConstant(Op2))
15871 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
15872 return Res;
15873 }
15874 }
15875
15876 // Look past (and (setcc_carry (cmp ...)), 1).
15877 if (Cond.getOpcode() == ISD::AND &&
15878 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
15879 isOneConstant(Cond.getOperand(1)))
15880 Cond = Cond.getOperand(0);
15881
15882 // If condition flag is set by a X86ISD::CMP, then use it as the condition
15883 // setting operand in place of the X86ISD::SETCC.
15884 unsigned CondOpcode = Cond.getOpcode();
15885 if (CondOpcode == X86ISD::SETCC ||
15886 CondOpcode == X86ISD::SETCC_CARRY) {
15887 CC = Cond.getOperand(0);
15888
15889 SDValue Cmp = Cond.getOperand(1);
15890 unsigned Opc = Cmp.getOpcode();
15891 MVT VT = Op.getSimpleValueType();
15892
15893 bool IllegalFPCMov = false;
15894 if (VT.isFloatingPoint() && !VT.isVector() &&
15895 !isScalarFPTypeInSSEReg(VT)) // FPStack?
15896 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
15897
15898 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
15899 Opc == X86ISD::BT) { // FIXME
15900 Cond = Cmp;
15901 addTest = false;
15902 }
15903 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
15904 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
15905 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
15906 Cond.getOperand(0).getValueType() != MVT::i8)) {
15907 SDValue LHS = Cond.getOperand(0);
15908 SDValue RHS = Cond.getOperand(1);
15909 unsigned X86Opcode;
15910 unsigned X86Cond;
15911 SDVTList VTs;
15912 switch (CondOpcode) {
15913 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
15914 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
15915 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
15916 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
15917 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
15918 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
15919 default: llvm_unreachable("unexpected overflowing operator");
15920 }
15921 if (CondOpcode == ISD::UMULO)
15922 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
15923 MVT::i32);
15924 else
15925 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
15926
15927 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
15928
15929 if (CondOpcode == ISD::UMULO)
15930 Cond = X86Op.getValue(2);
15931 else
15932 Cond = X86Op.getValue(1);
15933
15934 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
15935 addTest = false;
15936 }
15937
15938 if (addTest) {
15939 // Look past the truncate if the high bits are known zero.
15940 Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
15941
15942 // We know the result of AND is compared against zero. Try to match
15943 // it to BT.
15944 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
15945 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
15946 CC = NewSetCC.getOperand(0);
15947 Cond = NewSetCC.getOperand(1);
15948 addTest = false;
15949 }
15950 }
15951 }
15952
15953 if (addTest) {
15954 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
15955 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
15956 }
15957
15958 // a < b ? -1 : 0 -> RES = ~setcc_carry
15959 // a < b ? 0 : -1 -> RES = setcc_carry
15960 // a >= b ? -1 : 0 -> RES = setcc_carry
15961 // a >= b ? 0 : -1 -> RES = ~setcc_carry
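  // SETCC_CARRY(COND_B) materializes the carry flag of the SUB as all-zeros or
  // all-ones across the result type (effectively an SBB of a register with
  // itself).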
15962 if (Cond.getOpcode() == X86ISD::SUB) {
15963 Cond = ConvertCmpIfNecessary(Cond, DAG);
15964 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
15965
15966 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
15967 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
15968 (isNullConstant(Op1) || isNullConstant(Op2))) {
15969 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15970 DAG.getConstant(X86::COND_B, DL, MVT::i8),
15971 Cond);
15972 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
15973 return DAG.getNOT(DL, Res, Res.getValueType());
15974 return Res;
15975 }
15976 }
15977
15978 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
15979 // widen the cmov and push the truncate through. This avoids introducing a new
15980 // branch during isel and doesn't add any extensions.
15981 if (Op.getValueType() == MVT::i8 &&
15982 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
15983 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
15984 if (T1.getValueType() == T2.getValueType() &&
15985 // Blacklist CopyFromReg to avoid partial register stalls.
15986 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
15987 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
15988 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
15989 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
15990 }
15991 }
15992
15993 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
15994 // condition is true.
15995 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
15996 SDValue Ops[] = { Op2, Op1, CC, Cond };
15997 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
15998 }
15999
16000 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
16001 const X86Subtarget &Subtarget,
16002 SelectionDAG &DAG) {
16003 MVT VT = Op->getSimpleValueType(0);
16004 SDValue In = Op->getOperand(0);
16005 MVT InVT = In.getSimpleValueType();
16006 MVT VTElt = VT.getVectorElementType();
16007 MVT InVTElt = InVT.getVectorElementType();
16008 SDLoc dl(Op);
16009
16010 // SKX processor
16011 if ((InVTElt == MVT::i1) &&
16012 (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
16013 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
16014
16015 ((Subtarget.hasBWI() && VT.is512BitVector() &&
16016 VTElt.getSizeInBits() <= 16)) ||
16017
16018 ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
16019 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
16020
16021 ((Subtarget.hasDQI() && VT.is512BitVector() &&
16022 VTElt.getSizeInBits() >= 32))))
16023 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16024
16025 unsigned int NumElts = VT.getVectorNumElements();
16026
16027 if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
16028 return SDValue();
16029
16030 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
16031 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
16032 return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
16033 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16034 }
16035
16036 assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
16037 MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
16038 SDValue NegOne =
16039 DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl,
16040 ExtVT);
16041 SDValue Zero =
16042 DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
16043
16044 SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
16045 if (VT.is512BitVector())
16046 return V;
16047 return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
16048 }
16049
16050 static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
16051 const X86Subtarget &Subtarget,
16052 SelectionDAG &DAG) {
16053 SDValue In = Op->getOperand(0);
16054 MVT VT = Op->getSimpleValueType(0);
16055 MVT InVT = In.getSimpleValueType();
16056 assert(VT.getSizeInBits() == InVT.getSizeInBits());
16057
16058 MVT SVT = VT.getVectorElementType();
16059 MVT InSVT = InVT.getVectorElementType();
16060 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
16061
16062 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
16063 return SDValue();
16064 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
16065 return SDValue();
16066 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
16067 !(VT.is256BitVector() && Subtarget.hasInt256()))
16068 return SDValue();
16069
16070 SDLoc dl(Op);
16071
16072 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
16073 if (VT.is256BitVector())
16074 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
16075 MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2),
16076 In, DAG.getIntPtrConstant(0, dl));
16077
16078 // SSE41 targets can use the pmovsx* instructions directly.
16079 if (Subtarget.hasSSE41())
16080 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16081
16082 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
16083 SDValue Curr = In;
16084 MVT CurrVT = InVT;
16085
16086 // As SRAI is only available on i16/i32 types, we expand only up to i32
16087 // and handle i64 separately.
16088 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
16089 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
16090 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
16091 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
16092 Curr = DAG.getBitcast(CurrVT, Curr);
16093 }
16094
16095 SDValue SignExt = Curr;
16096 if (CurrVT != InVT) {
16097 unsigned SignExtShift =
16098 CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits();
16099 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
16100 DAG.getConstant(SignExtShift, dl, MVT::i8));
16101 }
16102
16103 if (CurrVT == VT)
16104 return SignExt;
16105
16106 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
16107 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
16108 DAG.getConstant(31, dl, MVT::i8));
16109 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
16110 return DAG.getBitcast(VT, Ext);
16111 }
16112
16113 return SDValue();
16114 }
16115
16116 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
16117 SelectionDAG &DAG) {
16118 MVT VT = Op->getSimpleValueType(0);
16119 SDValue In = Op->getOperand(0);
16120 MVT InVT = In.getSimpleValueType();
16121 SDLoc dl(Op);
16122
16123 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16124 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
16125
16126 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16127 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16128 (VT != MVT::v16i16 || InVT != MVT::v16i8))
16129 return SDValue();
16130
16131 if (Subtarget.hasInt256())
16132 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16133
16134   // Optimize vectors in AVX mode:
16135   // sign extend v8i16 to v8i32 and
16136   // v4i32 to v4i64.
16137   //
16138   // Divide the input vector into two parts;
16139   // for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 },
16140   // use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
16141   // then concat the vectors back to the original VT.
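  // For example (illustrative): v8i16 -> v8i32 without AVX2 becomes two
  // VPMOVSXWD operations on the low and high halves (each shuffled into the
  // bottom of a vector), concatenated back into a v8i32.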
16142
16143 unsigned NumElems = InVT.getVectorNumElements();
16144 SDValue Undef = DAG.getUNDEF(InVT);
16145
16146 SmallVector<int,8> ShufMask1(NumElems, -1);
16147 for (unsigned i = 0; i != NumElems/2; ++i)
16148 ShufMask1[i] = i;
16149
16150 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
16151
16152 SmallVector<int,8> ShufMask2(NumElems, -1);
16153 for (unsigned i = 0; i != NumElems/2; ++i)
16154 ShufMask2[i] = i + NumElems/2;
16155
16156 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
16157
16158 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
16159 VT.getVectorNumElements()/2);
16160
16161 OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16162 OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16163
16164 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16165 }
16166
16167 // Lower a truncating store. We need a special lowering for vXi1 vectors.
16168 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
16169 SelectionDAG &DAG) {
16170 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
16171 SDLoc dl(St);
16172 EVT MemVT = St->getMemoryVT();
16173 assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
16174 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
16175 "Expected truncstore of i1 vector");
16176
16177 SDValue Op = St->getValue();
16178 MVT OpVT = Op.getValueType().getSimpleVT();
16179 unsigned NumElts = OpVT.getVectorNumElements();
16180 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
16181 NumElts == 16) {
16182 // Truncate and store - everything is legal
16183 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
16184 if (MemVT.getSizeInBits() < 8)
16185 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
16186 DAG.getUNDEF(MVT::v8i1), Op,
16187 DAG.getIntPtrConstant(0, dl));
16188 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
16189 St->getMemOperand());
16190 }
16191
16192 // Only a subset of AVX-512 is available; assume we have just AVX-512F.
16193 if (NumElts <= 8) {
16194 if (NumElts < 8) {
16195 // Extend to 8-elts vector
16196 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
16197 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
16198 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
16199 }
16200 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
16201 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
16202 St->getMemOperand());
16203 }
16204 // v32i8
16205 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
16206 // Divide the vector into 2 parts and store each part separately
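// A sketch of the two stores emitted below; each v16i1 half occupies
// 16 bits (2 bytes), which is why the second store is at BasePtr + 2:
//   store (trunc lo to v16i1), BasePtr
//   store (trunc hi to v16i1), BasePtr + 2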
16207 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
16208 DAG.getIntPtrConstant(0, dl));
16209 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
16210 SDValue BasePtr = St->getBasePtr();
16211 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
16212 St->getMemOperand());
16213 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
16214 DAG.getIntPtrConstant(16, dl));
16215 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
16216
16217 SDValue BasePtrHi =
16218 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
16219 DAG.getConstant(2, dl, BasePtr.getValueType()));
16220
16221 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
16222 BasePtrHi, St->getMemOperand());
16223 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
16224 }
16225
16226 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
16227 const X86Subtarget &Subtarget,
16228 SelectionDAG &DAG) {
16229
16230 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16231 SDLoc dl(Ld);
16232 EVT MemVT = Ld->getMemoryVT();
16233 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
16234 "Expected i1 vector load");
16235 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
16236 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16237 MVT VT = Op.getValueType().getSimpleVT();
16238 unsigned NumElts = VT.getVectorNumElements();
16239
16240 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
16241 NumElts == 16) {
16242 // Load and extend - everything is legal
16243 if (NumElts < 8) {
16244 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
16245 Ld->getBasePtr(),
16246 Ld->getMemOperand());
16247 // Replace chain users with the new chain.
16248 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16249 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16250 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
16251 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
16252
16253 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
16254 DAG.getIntPtrConstant(0, dl));
16255 }
16256 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
16257 Ld->getBasePtr(),
16258 Ld->getMemOperand());
16259 // Replace chain users with the new chain.
16260 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16261 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16262
16263 // Finally, do a normal sign-extend to the desired register.
16264 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
16265 }
16266
16267 if (NumElts <= 8) {
16268 // Only a subset of AVX-512 is available; assume we have just AVX-512F.
16269 unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
16270 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
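// For a v2i1 or v4i1 extload we still round up and load a full i8 from
// memory, since nothing narrower than a byte can be loaded directly.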
16271 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
16272 Ld->getBasePtr(),
16273 Ld->getMemOperand());
16274 // Replace chain users with the new chain.
16275 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16276 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16277
16278 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
16279 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
16280
16281 if (NumElts == 8)
16282 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
16283
16284 // Handle v4i1 and v2i1: extend through an 8-element vector and extract back.
16285
16286 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
16287 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
16288 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
16289 DAG.getIntPtrConstant(0, dl));
16290 }
16291
16292 assert(VT == MVT::v32i8 && "Unexpected extload type");
16293
16294 SmallVector<SDValue, 2> Chains;
16295
16296 SDValue BasePtr = Ld->getBasePtr();
16297 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
16298 Ld->getBasePtr(),
16299 Ld->getMemOperand());
16300 Chains.push_back(LoadLo.getValue(1));
16301
16302 SDValue BasePtrHi =
16303 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
16304 DAG.getConstant(2, dl, BasePtr.getValueType()));
16305
16306 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
16307 BasePtrHi,
16308 Ld->getMemOperand());
16309 Chains.push_back(LoadHi.getValue(1));
16310 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16311 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
16312
16313 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
16314 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
16315 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
16316 }
16317
16318 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
16319 // may emit an illegal shuffle but the expansion is still better than scalar
16320 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16321 // we'll emit a shuffle and an arithmetic shift.
16322 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
16323 // TODO: It is possible to support ZExt by zeroing the undef values during
16324 // the shuffle phase or after the shuffle.
16325 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
16326 SelectionDAG &DAG) {
16327 MVT RegVT = Op.getSimpleValueType();
16328 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16329 assert(RegVT.isInteger() &&
16330 "We only custom lower integer vector sext loads.");
16331
16332 // Nothing useful we can do without SSE2 shuffles.
16333 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
16334
16335 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16336 SDLoc dl(Ld);
16337 EVT MemVT = Ld->getMemoryVT();
16338 if (MemVT.getScalarType() == MVT::i1)
16339 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
16340
16341 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16342 unsigned RegSz = RegVT.getSizeInBits();
16343
16344 ISD::LoadExtType Ext = Ld->getExtensionType();
16345
16346 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16347 && "Only anyext and sext are currently implemented.");
16348 assert(MemVT != RegVT && "Cannot extend to the same type");
16349 assert(MemVT.isVector() && "Must load a vector from memory");
16350
16351 unsigned NumElems = RegVT.getVectorNumElements();
16352 unsigned MemSz = MemVT.getSizeInBits();
16353 assert(RegSz > MemSz && "Register size must be greater than the mem size");
16354
16355 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
16356 // The only way in which we have a legal 256-bit vector result but not the
16357 // integer 256-bit operations needed to directly lower a sextload is if we
16358 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16359 // a 128-bit vector and a normal sign_extend to 256-bits that should get
16360 // correctly legalized. We do this late to allow the canonical form of
16361 // sextload to persist throughout the rest of the DAG combiner -- it wants
16362 // to fold together any extensions it can, and so will fuse a sign_extend
16363 // of an sextload into a sextload targeting a wider value.
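// For example (a sketch), a sextload of v8i32 from v8i16 memory on AVX1
// becomes a plain v8i16 load followed by a v8i32 sign_extend, which the
// legalizer then splits into two 128-bit pmovsxwd halves.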
16364 SDValue Load;
16365 if (MemSz == 128) {
16366 // Just switch this to a normal load.
16367 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16368 "it must be a legal 128-bit vector "
16369 "type!");
16370 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16371 Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
16372 Ld->isInvariant(), Ld->getAlignment());
16373 } else {
16374 assert(MemSz < 128 &&
16375 "Can't extend a type wider than 128 bits to a 256 bit vector!");
16376 // Do an sext load to a 128-bit vector type. We want to use the same
16377 // number of elements, but elements half as wide. This will end up being
16378 // recursively lowered by this routine, but will succeed as we definitely
16379 // have all the necessary features if we're using AVX1.
16380 EVT HalfEltVT =
16381 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16382 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16383 Load =
16384 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16385 Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
16386 Ld->isNonTemporal(), Ld->isInvariant(),
16387 Ld->getAlignment());
16388 }
16389
16390 // Replace chain users with the new chain.
16391 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16392 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16393
16394 // Finally, do a normal sign-extend to the desired register.
16395 return DAG.getSExtOrTrunc(Load, dl, RegVT);
16396 }
16397
16398 // All sizes must be a power of two.
16399 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16400 "Non-power-of-two elements are not custom lowered!");
16401
16402 // Attempt to load the original value using scalar loads.
16403 // Find the largest scalar type that divides the total loaded size.
16404 MVT SclrLoadTy = MVT::i8;
16405 for (MVT Tp : MVT::integer_valuetypes()) {
16406 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16407 SclrLoadTy = Tp;
16408 }
16409 }
16410
16411 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
16412 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16413 (64 <= MemSz))
16414 SclrLoadTy = MVT::f64;
16415
16416 // Calculate the number of scalar loads that we need to perform
16417 // in order to load our vector from memory.
16418 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16419
16420 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16421 "Can only lower sext loads with a single scalar load!");
16422
16423 unsigned loadRegZize = RegSz;
16424 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
16425 loadRegZize = 128;
16426
16427 // Represent our vector as a sequence of elements which are the
16428 // largest scalar that we can load.
16429 EVT LoadUnitVecVT = EVT::getVectorVT(
16430 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
16431
16432 // Represent the data using the same element type that is stored in
16433 // memory. In practice, we "widen" MemVT.
16434 EVT WideVecVT =
16435 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16436 loadRegZize / MemVT.getScalarSizeInBits());
16437
16438 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16439 "Invalid vector type");
16440
16441 // We can't shuffle using an illegal type.
16442 assert(TLI.isTypeLegal(WideVecVT) &&
16443 "We only lower types that form legal widened vector types");
16444
16445 SmallVector<SDValue, 8> Chains;
16446 SDValue Ptr = Ld->getBasePtr();
16447 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
16448 TLI.getPointerTy(DAG.getDataLayout()));
16449 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16450
16451 for (unsigned i = 0; i < NumLoads; ++i) {
16452 // Perform a single load.
16453 SDValue ScalarLoad =
16454 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16455 Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
16456 Ld->getAlignment());
16457 Chains.push_back(ScalarLoad.getValue(1));
16458 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
16459 // another round of DAGCombining.
16460 if (i == 0)
16461 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16462 else
16463 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16464 ScalarLoad, DAG.getIntPtrConstant(i, dl));
16465
16466 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16467 }
16468
16469 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16470
16471 // Bitcast the loaded value to a vector of the original element type, in
16472 // the size of the target vector type.
16473 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
16474 unsigned SizeRatio = RegSz / MemSz;
16475
16476 if (Ext == ISD::SEXTLOAD) {
16477 // If we have SSE4.1, we can directly emit a VSEXT node.
16478 if (Subtarget.hasSSE41()) {
16479 SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16480 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16481 return Sext;
16482 }
16483
16484 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
16485 // lanes.
16486 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
16487 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
16488
16489 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
16490 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16491 return Shuff;
16492 }
16493
16494 // Redistribute the loaded elements into the different locations.
16495 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16496 for (unsigned i = 0; i != NumElems; ++i)
16497 ShuffleVec[i * SizeRatio] = i;
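// For example (a sketch), an extending load of v4i32 from v4i8 performs one
// i32 scalar load, bitcasts it to v16i8, and the shuffle below scatters the
// four loaded bytes to positions {0, 4, 8, 12}; bitcasting the result back
// to v4i32 leaves each loaded byte in the low part of its i32 lane.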
16498
16499 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16500 DAG.getUNDEF(WideVecVT), ShuffleVec);
16501
16502 // Bitcast to the requested type.
16503 Shuff = DAG.getBitcast(RegVT, Shuff);
16504 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16505 return Shuff;
16506 }
16507
16508 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
16509 /// each of which has no other use apart from the AND / OR.
16510 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16511 Opc = Op.getOpcode();
16512 if (Opc != ISD::OR && Opc != ISD::AND)
16513 return false;
16514 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16515 Op.getOperand(0).hasOneUse() &&
16516 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
16517 Op.getOperand(1).hasOneUse());
16518 }
16519
16520 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
16521 /// SETCC node has a single use.
16522 static bool isXor1OfSetCC(SDValue Op) {
16523 if (Op.getOpcode() != ISD::XOR)
16524 return false;
16525 if (isOneConstant(Op.getOperand(1)))
16526 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16527 Op.getOperand(0).hasOneUse();
16528 return false;
16529 }
16530
16531 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
16532 bool addTest = true;
16533 SDValue Chain = Op.getOperand(0);
16534 SDValue Cond = Op.getOperand(1);
16535 SDValue Dest = Op.getOperand(2);
16536 SDLoc dl(Op);
16537 SDValue CC;
16538 bool Inverted = false;
16539
16540 if (Cond.getOpcode() == ISD::SETCC) {
16541 // Check for setcc([su]{add,sub,mul}o == 0).
16542 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16543 isNullConstant(Cond.getOperand(1)) &&
16544 Cond.getOperand(0).getResNo() == 1 &&
16545 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16546 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16547 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16548 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16549 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16550 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
16551 Inverted = true;
16552 Cond = Cond.getOperand(0);
16553 } else {
16554 if (SDValue NewCond = LowerSETCC(Cond, DAG))
16555 Cond = NewCond;
16556 }
16557 }
16558 #if 0
16559 // FIXME: LowerXALUO doesn't handle these!!
16560 else if (Cond.getOpcode() == X86ISD::ADD ||
16561 Cond.getOpcode() == X86ISD::SUB ||
16562 Cond.getOpcode() == X86ISD::SMUL ||
16563 Cond.getOpcode() == X86ISD::UMUL)
16564 Cond = LowerXALUO(Cond, DAG);
16565 #endif
16566
16567 // Look past (and (setcc_carry (cmp ...)), 1).
16568 if (Cond.getOpcode() == ISD::AND &&
16569 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
16570 isOneConstant(Cond.getOperand(1)))
16571 Cond = Cond.getOperand(0);
16572
16573 // If condition flag is set by a X86ISD::CMP, then use it as the condition
16574 // setting operand in place of the X86ISD::SETCC.
16575 unsigned CondOpcode = Cond.getOpcode();
16576 if (CondOpcode == X86ISD::SETCC ||
16577 CondOpcode == X86ISD::SETCC_CARRY) {
16578 CC = Cond.getOperand(0);
16579
16580 SDValue Cmp = Cond.getOperand(1);
16581 unsigned Opc = Cmp.getOpcode();
16582 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16583 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16584 Cond = Cmp;
16585 addTest = false;
16586 } else {
16587 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16588 default: break;
16589 case X86::COND_O:
16590 case X86::COND_B:
16591 // These can only come from an arithmetic instruction with overflow,
16592 // e.g. SADDO, UADDO.
16593 Cond = Cond.getNode()->getOperand(1);
16594 addTest = false;
16595 break;
16596 }
16597 }
16598 }
16599 CondOpcode = Cond.getOpcode();
16600 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16601 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16602 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16603 Cond.getOperand(0).getValueType() != MVT::i8)) {
16604 SDValue LHS = Cond.getOperand(0);
16605 SDValue RHS = Cond.getOperand(1);
16606 unsigned X86Opcode;
16607 unsigned X86Cond;
16608 SDVTList VTs;
16609 // Keep this in sync with LowerXALUO, otherwise we might create redundant
16610 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
16611 // X86ISD::INC).
16612 switch (CondOpcode) {
16613 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16614 case ISD::SADDO:
16615 if (isOneConstant(RHS)) {
16616 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16617 break;
16618 }
16619 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16620 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16621 case ISD::SSUBO:
16622 if (isOneConstant(RHS)) {
16623 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16624 break;
16625 }
16626 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16627 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16628 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16629 default: llvm_unreachable("unexpected overflowing operator");
16630 }
16631 if (Inverted)
16632 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
16633 if (CondOpcode == ISD::UMULO)
16634 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16635 MVT::i32);
16636 else
16637 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16638
16639 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16640
16641 if (CondOpcode == ISD::UMULO)
16642 Cond = X86Op.getValue(2);
16643 else
16644 Cond = X86Op.getValue(1);
16645
16646 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
16647 addTest = false;
16648 } else {
16649 unsigned CondOpc;
16650 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
16651 SDValue Cmp = Cond.getOperand(0).getOperand(1);
16652 if (CondOpc == ISD::OR) {
16653 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16654 // two branches instead of an explicit OR instruction with a
16655 // separate test.
16656 if (Cmp == Cond.getOperand(1).getOperand(1) &&
16657 isX86LogicalCmp(Cmp)) {
16658 CC = Cond.getOperand(0).getOperand(0);
16659 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16660 Chain, Dest, CC, Cmp);
16661 CC = Cond.getOperand(1).getOperand(0);
16662 Cond = Cmp;
16663 addTest = false;
16664 }
16665 } else { // ISD::AND
16666 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16667 // two branches instead of an explicit AND instruction with a
16668 // separate test. However, we only do this if this block doesn't
16669 // have a fall-through edge, because this requires an explicit
16670 // jmp when the condition is false.
16671 if (Cmp == Cond.getOperand(1).getOperand(1) &&
16672 isX86LogicalCmp(Cmp) &&
16673 Op.getNode()->hasOneUse()) {
16674 X86::CondCode CCode =
16675 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16676 CCode = X86::GetOppositeBranchCondition(CCode);
16677 CC = DAG.getConstant(CCode, dl, MVT::i8);
16678 SDNode *User = *Op.getNode()->use_begin();
16679 // Look for an unconditional branch following this conditional branch.
16680 // We need this because we need to reverse the successors in order
16681 // to implement FCMP_OEQ.
16682 if (User->getOpcode() == ISD::BR) {
16683 SDValue FalseBB = User->getOperand(1);
16684 SDNode *NewBR =
16685 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16686 assert(NewBR == User);
16687 (void)NewBR;
16688 Dest = FalseBB;
16689
16690 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16691 Chain, Dest, CC, Cmp);
16692 X86::CondCode CCode =
16693 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16694 CCode = X86::GetOppositeBranchCondition(CCode);
16695 CC = DAG.getConstant(CCode, dl, MVT::i8);
16696 Cond = Cmp;
16697 addTest = false;
16698 }
16699 }
16700 }
16701 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16702 // Recognize the (xorb (setcc), 1) pattern; the xor inverts the condition.
16703 // It should be transformed by the DAG combiner, except when the condition
16704 // is set by an arithmetic-with-overflow node.
16705 X86::CondCode CCode =
16706 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16707 CCode = X86::GetOppositeBranchCondition(CCode);
16708 CC = DAG.getConstant(CCode, dl, MVT::i8);
16709 Cond = Cond.getOperand(0).getOperand(1);
16710 addTest = false;
16711 } else if (Cond.getOpcode() == ISD::SETCC &&
16712 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16713 // For FCMP_OEQ, we can emit
16714 // two branches instead of an explicit AND instruction with a
16715 // separate test. However, we only do this if this block doesn't
16716 // have a fall-through edge, because this requires an explicit
16717 // jmp when the condition is false.
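// Roughly, "br (fcmp oeq a, b), T, F" ends up as:
//   ucomiss a, b
//   jne F
//   jp  F
//   jmp T
// (the unconditional successor is rewritten below so both the inequality
// branch and the parity branch can target the false block).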
16718 if (Op.getNode()->hasOneUse()) {
16719 SDNode *User = *Op.getNode()->use_begin();
16720 // Look for an unconditional branch following this conditional branch.
16721 // We need this because we need to reverse the successors in order
16722 // to implement FCMP_OEQ.
16723 if (User->getOpcode() == ISD::BR) {
16724 SDValue FalseBB = User->getOperand(1);
16725 SDNode *NewBR =
16726 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16727 assert(NewBR == User);
16728 (void)NewBR;
16729 Dest = FalseBB;
16730
16731 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16732 Cond.getOperand(0), Cond.getOperand(1));
16733 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16734 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
16735 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16736 Chain, Dest, CC, Cmp);
16737 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
16738 Cond = Cmp;
16739 addTest = false;
16740 }
16741 }
16742 } else if (Cond.getOpcode() == ISD::SETCC &&
16743 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16744 // For FCMP_UNE, we can emit
16745 // two branches instead of an explicit AND instruction with a
16746 // separate test. However, we only do this if this block doesn't
16747 // have a fall-through edge, because this requires an explicit
16748 // jmp when the condition is false.
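// Roughly, "br (fcmp une a, b), T, F" ends up as:
//   ucomiss a, b
//   jne T
//   jnp F
//   jmp T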
16749 if (Op.getNode()->hasOneUse()) {
16750 SDNode *User = *Op.getNode()->use_begin();
16751 // Look for an unconditional branch following this conditional branch.
16752 // We need this because we need to reverse the successors in order
16753 // to implement FCMP_UNE.
16754 if (User->getOpcode() == ISD::BR) {
16755 SDValue FalseBB = User->getOperand(1);
16756 SDNode *NewBR =
16757 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16758 assert(NewBR == User);
16759 (void)NewBR;
16760
16761 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16762 Cond.getOperand(0), Cond.getOperand(1));
16763 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16764 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
16765 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16766 Chain, Dest, CC, Cmp);
16767 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
16768 Cond = Cmp;
16769 addTest = false;
16770 Dest = FalseBB;
16771 }
16772 }
16773 }
16774 }
16775
16776 if (addTest) {
16777 // Look past the truncate if the high bits are known zero.
16778 Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
16779
16780 // We know the result of AND is compared against zero. Try to match
16781 // it to BT.
16782 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16783 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
16784 CC = NewSetCC.getOperand(0);
16785 Cond = NewSetCC.getOperand(1);
16786 addTest = false;
16787 }
16788 }
16789 }
16790
16791 if (addTest) {
16792 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16793 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
16794 Cond = EmitTest(Cond, X86Cond, dl, DAG);
16795 }
16796 Cond = ConvertCmpIfNecessary(Cond, DAG);
16797 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16798 Chain, Dest, CC, Cond);
16799 }
16800
16801 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
16802 // Calls to _alloca are needed to probe the stack when allocating more than 4k
16803 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
16804 // that the guard pages used by the OS virtual memory manager are allocated in
16805 // correct sequence.
16806 SDValue
16807 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16808 SelectionDAG &DAG) const {
16809 MachineFunction &MF = DAG.getMachineFunction();
16810 bool SplitStack = MF.shouldSplitStack();
16811 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
16812 SplitStack;
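// Three lowering strategies are used below (a summary of the code that
// follows):
//   1. !Lower:     adjust the stack pointer inline (SUB plus an optional
//                  AND for over-alignment).
//   2. SplitStack: emit X86ISD::SEG_ALLOCA for segmented stacks.
//   3. otherwise:  emit X86ISD::WIN_ALLOCA, which probes the stack in
//                  page-sized steps on Windows targets.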
16813 SDLoc dl(Op);
16814
16815 // Get the inputs.
16816 SDNode *Node = Op.getNode();
16817 SDValue Chain = Op.getOperand(0);
16818 SDValue Size = Op.getOperand(1);
16819 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16820 EVT VT = Node->getValueType(0);
16821
16822 // Chain the dynamic stack allocation so that it doesn't modify the stack
16823 // pointer when other instructions are using the stack.
16824 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
16825
16826 bool Is64Bit = Subtarget.is64Bit();
16827 MVT SPTy = getPointerTy(DAG.getDataLayout());
16828
16829 SDValue Result;
16830 if (!Lower) {
16831 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16832 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16833 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16834 " not tell us which reg is the stack pointer!");
16835
16836 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16837 Chain = SP.getValue(1);
16838 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
16839 unsigned StackAlign = TFI.getStackAlignment();
16840 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
16841 if (Align > StackAlign)
16842 Result = DAG.getNode(ISD::AND, dl, VT, Result,
16843 DAG.getConstant(-(uint64_t)Align, dl, VT));
16844 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
16845 } else if (SplitStack) {
16846 MachineRegisterInfo &MRI = MF.getRegInfo();
16847
16848 if (Is64Bit) {
16849 // The 64-bit implementation of segmented stacks needs to clobber both r10
16850 // and r11, so it cannot be used together with nested parameters.
16851 const Function *F = MF.getFunction();
16852 for (const auto &A : F->args()) {
16853 if (A.hasNestAttr())
16854 report_fatal_error("Cannot use segmented stacks with functions that "
16855 "have nested arguments.");
16856 }
16857 }
16858
16859 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
16860 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16861 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16862 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16863 DAG.getRegister(Vreg, SPTy));
16864 } else {
16865 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16866 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
16867 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
16868
16869 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
16870 unsigned SPReg = RegInfo->getStackRegister();
16871 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16872 Chain = SP.getValue(1);
16873
16874 if (Align) {
16875 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16876 DAG.getConstant(-(uint64_t)Align, dl, VT));
16877 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16878 }
16879
16880 Result = SP;
16881 }
16882
16883 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
16884 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
16885
16886 SDValue Ops[2] = {Result, Chain};
16887 return DAG.getMergeValues(Ops, dl);
16888 }
16889
16890 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
16891 MachineFunction &MF = DAG.getMachineFunction();
16892 auto PtrVT = getPointerTy(MF.getDataLayout());
16893 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16894
16895 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16896 SDLoc DL(Op);
16897
16898 if (!Subtarget.is64Bit() ||
16899 Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
16900 // vastart just stores the address of the VarArgsFrameIndex slot into the
16901 // memory location argument.
16902 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
16903 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16904 MachinePointerInfo(SV), false, false, 0);
16905 }
16906
16907 // __va_list_tag:
16908 // gp_offset (0 - 6 * 8)
16909 // fp_offset (48 - 48 + 8 * 16)
16910 // overflow_arg_area (points to parameters passed in memory).
16911 // reg_save_area
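// In C terms this is roughly (LP64 layout, offsets 0/4/8/16; the x32 ABI
// uses 4-byte pointers, giving offsets 0/4/8/12 as reflected below):
//   struct __va_list_tag {
//     i32 gp_offset;
//     i32 fp_offset;
//     i8 *overflow_arg_area;
//     i8 *reg_save_area;
//   };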
16912 SmallVector<SDValue, 8> MemOps;
16913 SDValue FIN = Op.getOperand(1);
16914 // Store gp_offset
16915 SDValue Store = DAG.getStore(Op.getOperand(0), DL,
16916 DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
16917 DL, MVT::i32),
16918 FIN, MachinePointerInfo(SV), false, false, 0);
16919 MemOps.push_back(Store);
16920
16921 // Store fp_offset
16922 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
16923 Store = DAG.getStore(Op.getOperand(0), DL,
16924 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL,
16925 MVT::i32),
16926 FIN, MachinePointerInfo(SV, 4), false, false, 0);
16927 MemOps.push_back(Store);
16928
16929 // Store ptr to overflow_arg_area
16930 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
16931 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
16932 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
16933 MachinePointerInfo(SV, 8),
16934 false, false, 0);
16935 MemOps.push_back(Store);
16936
16937 // Store ptr to reg_save_area.
16938 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
16939 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
16940 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
16941 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(
16942 SV, Subtarget.isTarget64BitLP64() ? 16 : 12), false, false, 0);
16943 MemOps.push_back(Store);
16944 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16945 }
16946
16947 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
16948 assert(Subtarget.is64Bit() &&
16949 "LowerVAARG only handles 64-bit va_arg!");
16950 assert(Op.getNode()->getNumOperands() == 4);
16951
16952 MachineFunction &MF = DAG.getMachineFunction();
16953 if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
16954 // The Win64 ABI uses char* instead of a structure.
16955 return DAG.expandVAArg(Op.getNode());
16956
16957 SDValue Chain = Op.getOperand(0);
16958 SDValue SrcPtr = Op.getOperand(1);
16959 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16960 unsigned Align = Op.getConstantOperandVal(3);
16961 SDLoc dl(Op);
16962
16963 EVT ArgVT = Op.getNode()->getValueType(0);
16964 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16965 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
16966 uint8_t ArgMode;
16967
16968 // Decide which area this value should be read from.
16969 // TODO: Implement the AMD64 ABI in its entirety. This simple
16970 // selection mechanism works only for the basic types.
16971 if (ArgVT == MVT::f80) {
16972 llvm_unreachable("va_arg for f80 not yet implemented");
16973 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16974 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
16975 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16976 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
16977 } else {
16978 llvm_unreachable("Unhandled argument type in LowerVAARG");
16979 }
16980
16981 if (ArgMode == 2) {
16982 // Sanity Check: Make sure using fp_offset makes sense.
16983 assert(!Subtarget.useSoftFloat() &&
16984 !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
16985 Subtarget.hasSSE1());
16986 }
16987
16988 // Insert a VAARG_64 node into the DAG.
16989 // VAARG_64 returns two values: the variable argument address and the chain.
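// The VAARG_64 pseudo is expanded later by a custom inserter into the usual
// va_arg control flow: compare gp_offset/fp_offset against the register
// save area limit, load either from reg_save_area or from overflow_arg_area,
// and bump the corresponding offset or pointer (a rough description; the
// custom inserter has the exact CFG).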
16990 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
16991 DAG.getConstant(ArgMode, dl, MVT::i8),
16992 DAG.getConstant(Align, dl, MVT::i32)};
16993 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
16994 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
16995 VTs, InstOps, MVT::i64,
16996 MachinePointerInfo(SV),
16997 /*Align=*/0,
16998 /*Volatile=*/false,
16999 /*ReadMem=*/true,
17000 /*WriteMem=*/true);
17001 Chain = VAARG.getValue(1);
17002
17003 // Load the next argument and return it
17004 return DAG.getLoad(ArgVT, dl,
17005 Chain,
17006 VAARG,
17007 MachinePointerInfo(),
17008 false, false, false, 0);
17009 }
17010
17011 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
17012 SelectionDAG &DAG) {
17013 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
17014 // where a va_list is still an i8*.
17015 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
17016 if (Subtarget.isCallingConvWin64(
17017 DAG.getMachineFunction().getFunction()->getCallingConv()))
17018 // Probably a Win64 va_copy.
17019 return DAG.expandVACopy(Op.getNode());
17020
17021 SDValue Chain = Op.getOperand(0);
17022 SDValue DstPtr = Op.getOperand(1);
17023 SDValue SrcPtr = Op.getOperand(2);
17024 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
17025 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
17026 SDLoc DL(Op);
17027
17028 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
17029 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
17030 false, false,
17031 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
17032 }
17033
17034 /// Handle vector element shifts where the shift amount is a constant.
17035 /// Takes immediate version of shift as input.
17036 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
17037 SDValue SrcOp, uint64_t ShiftAmt,
17038 SelectionDAG &DAG) {
17039 MVT ElementType = VT.getVectorElementType();
17040
17041 // Fold this packed shift into its first operand if ShiftAmt is 0.
17042 if (ShiftAmt == 0)
17043 return SrcOp;
17044
17045 // Check for ShiftAmt >= element width
17046 if (ShiftAmt >= ElementType.getSizeInBits()) {
17047 if (Opc == X86ISD::VSRAI)
17048 ShiftAmt = ElementType.getSizeInBits() - 1;
17049 else
17050 return DAG.getConstant(0, dl, VT);
17051 }
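// For example, (VSRLI v4i32 X, 32) folds to the zero vector, while
// (VSRAI v4i32 X, 32) is clamped to a shift by 31, preserving the
// "replicate the sign bit" semantics.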
17052
17053 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
17054 && "Unknown target vector shift-by-constant node");
17055
17056 // Fold this packed vector shift into a build vector if SrcOp is a
17057 // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
17058 if (VT == SrcOp.getSimpleValueType() &&
17059 ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
17060 SmallVector<SDValue, 8> Elts;
17061 unsigned NumElts = SrcOp->getNumOperands();
17062 ConstantSDNode *ND;
17063
17064 switch(Opc) {
17065 default: llvm_unreachable("Unknown opcode!");
17066 case X86ISD::VSHLI:
17067 for (unsigned i=0; i!=NumElts; ++i) {
17068 SDValue CurrentOp = SrcOp->getOperand(i);
17069 if (CurrentOp->isUndef()) {
17070 Elts.push_back(CurrentOp);
17071 continue;
17072 }
17073 ND = cast<ConstantSDNode>(CurrentOp);
17074 const APInt &C = ND->getAPIntValue();
17075 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
17076 }
17077 break;
17078 case X86ISD::VSRLI:
17079 for (unsigned i=0; i!=NumElts; ++i) {
17080 SDValue CurrentOp = SrcOp->getOperand(i);
17081 if (CurrentOp->isUndef()) {
17082 Elts.push_back(CurrentOp);
17083 continue;
17084 }
17085 ND = cast<ConstantSDNode>(CurrentOp);
17086 const APInt &C = ND->getAPIntValue();
17087 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
17088 }
17089 break;
17090 case X86ISD::VSRAI:
17091 for (unsigned i=0; i!=NumElts; ++i) {
17092 SDValue CurrentOp = SrcOp->getOperand(i);
17093 if (CurrentOp->isUndef()) {
17094 Elts.push_back(CurrentOp);
17095 continue;
17096 }
17097 ND = cast<ConstantSDNode>(CurrentOp);
17098 const APInt &C = ND->getAPIntValue();
17099 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
17100 }
17101 break;
17102 }
17103
17104 return DAG.getBuildVector(VT, dl, Elts);
17105 }
17106
17107 return DAG.getNode(Opc, dl, VT, SrcOp,
17108 DAG.getConstant(ShiftAmt, dl, MVT::i8));
17109 }
17110
17111 /// Handle vector element shifts where the shift amount may or may not be a
17112 /// constant. Takes immediate version of shift as input.
17113 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
17114 SDValue SrcOp, SDValue ShAmt,
17115 SelectionDAG &DAG) {
17116 MVT SVT = ShAmt.getSimpleValueType();
17117 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
17118
17119 // Catch shift-by-constant.
17120 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
17121 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
17122 CShAmt->getZExtValue(), DAG);
17123
17124 // Change opcode to non-immediate version
17125 switch (Opc) {
17126 default: llvm_unreachable("Unknown target vector shift node");
17127 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
17128 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
17129 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
17130 }
17131
17132 const X86Subtarget &Subtarget =
17133 static_cast<const X86Subtarget &>(DAG.getSubtarget());
17134 if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
17135 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
17136 // Let the shuffle legalizer expand this shift amount node.
17137 SDValue Op0 = ShAmt.getOperand(0);
17138 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
17139 ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG);
17140 } else {
17141 // Need to build a vector containing shift amount.
17142 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
17143 SmallVector<SDValue, 4> ShOps;
17144 ShOps.push_back(ShAmt);
17145 if (SVT == MVT::i32) {
17146 ShOps.push_back(DAG.getConstant(0, dl, SVT));
17147 ShOps.push_back(DAG.getUNDEF(SVT));
17148 }
17149 ShOps.push_back(DAG.getUNDEF(SVT));
17150
17151 MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
17152 ShAmt = DAG.getBuildVector(BVT, dl, ShOps);
17153 }
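// E.g. for an i32 shift amount this builds {ShAmt, 0, undef, undef}; the
// packed shifts only read the low 64 bits of the count, so the upper
// elements do not matter.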
17154
17155 // The return type has to be a 128-bit type with the same element
17156 // type as the input type.
17157 MVT EltVT = VT.getVectorElementType();
17158 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
17159
17160 ShAmt = DAG.getBitcast(ShVT, ShAmt);
17161 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
17162 }
17163
17164 /// \brief Return Mask with the necessary casting or extending
17165 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
17166 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
17167 const X86Subtarget &Subtarget, SelectionDAG &DAG,
17168 const SDLoc &dl) {
17169
17170 if (isAllOnesConstant(Mask))
17171 return DAG.getTargetConstant(1, dl, MaskVT);
17172 if (X86::isZeroNode(Mask))
17173 return DAG.getTargetConstant(0, dl, MaskVT);
17174
17175 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
17176 // Mask should be extended
17177 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
17178 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
17179 }
17180
17181 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
17182 if (MaskVT == MVT::v64i1) {
17183 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
17184 // In 32-bit mode a bitcast of i64 is illegal, so split it into two i32 halves.
17185 SDValue Lo, Hi;
17186 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
17187 DAG.getConstant(0, dl, MVT::i32));
17188 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
17189 DAG.getConstant(1, dl, MVT::i32));
17190
17191 Lo = DAG.getBitcast(MVT::v32i1, Lo);
17192 Hi = DAG.getBitcast(MVT::v32i1, Hi);
17193
17194 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
17195 } else {
17196 // MaskVT requires fewer than 64 bits. Truncate the mask (this should
17197 // always succeed) and bitcast to the mask type.
17198 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
17199 return DAG.getBitcast(MaskVT,
17200 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
17201 }
17202
17203 } else {
17204 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
17205 Mask.getSimpleValueType().getSizeInBits());
17206 // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
17207 // are extracted with EXTRACT_SUBVECTOR.
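// E.g. an i8 mask with MaskVT == v4i1 becomes
// (extract_subvector (v8i1 bitcast i8), 0).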
17208 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17209 DAG.getBitcast(BitcastVT, Mask),
17210 DAG.getIntPtrConstant(0, dl));
17211 }
17212 }
17213
17214 /// \brief Return (and \p Op, \p Mask) for compare instructions or
17215 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
17216 /// necessary casting or extending for \p Mask when lowering masking intrinsics
17217 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
17218 SDValue PreservedSrc,
17219 const X86Subtarget &Subtarget,
17220 SelectionDAG &DAG) {
17221 MVT VT = Op.getSimpleValueType();
17222 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17223 unsigned OpcodeSelect = ISD::VSELECT;
17224 SDLoc dl(Op);
17225
17226 if (isAllOnesConstant(Mask))
17227 return Op;
17228
17229 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
17230
17231 switch (Op.getOpcode()) {
17232 default: break;
17233 case X86ISD::PCMPEQM:
17234 case X86ISD::PCMPGTM:
17235 case X86ISD::CMPM:
17236 case X86ISD::CMPMU:
17237 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
17238 case X86ISD::VFPCLASS:
17239 case X86ISD::VFPCLASSS:
17240 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
17241 case X86ISD::VTRUNC:
17242 case X86ISD::VTRUNCS:
17243 case X86ISD::VTRUNCUS:
17244 case ISD::FP_TO_FP16:
17245 // We can't use ISD::VSELECT here because it is not always "Legal"
17246 // for the destination type. For example, vpmovqb requires only AVX512F,
17247 // while a vselect that operates on byte elements requires AVX512BW.
17248 OpcodeSelect = X86ISD::SELECT;
17249 break;
17250 }
17251 if (PreservedSrc.isUndef())
17252 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17253 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
17254 }
17255
17256 /// \brief Creates an SDNode for a predicated scalar operation.
17257 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
17258 /// The mask comes in as MVT::i8 and should be truncated
17259 /// to MVT::i1 while lowering masking intrinsics.
17260 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
17261 /// "X86select" instead of "vselect". We just can't create the "vselect" node
17262 /// for a scalar instruction.
17263 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
17264 SDValue PreservedSrc,
17265 const X86Subtarget &Subtarget,
17266 SelectionDAG &DAG) {
17267 if (isAllOnesConstant(Mask))
17268 return Op;
17269
17270 MVT VT = Op.getSimpleValueType();
17271 SDLoc dl(Op);
17272 // The mask should be of type MVT::i1
17273 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17274
17275 if (Op.getOpcode() == X86ISD::FSETCC)
17276 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
17277 if (Op.getOpcode() == X86ISD::VFPCLASS ||
17278 Op.getOpcode() == X86ISD::VFPCLASSS)
17279 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
17280
17281 if (PreservedSrc.isUndef())
17282 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17283 return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17284 }
17285
17286 static int getSEHRegistrationNodeSize(const Function *Fn) {
17287 if (!Fn->hasPersonalityFn())
17288 report_fatal_error(
17289 "querying registration node size for function without personality");
17290 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
17291 // WinEHStatePass for the full struct definition.
17292 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
17293 case EHPersonality::MSVC_X86SEH: return 24;
17294 case EHPersonality::MSVC_CXX: return 16;
17295 default: break;
17296 }
17297 report_fatal_error(
17298 "can only recover FP for 32-bit MSVC EH personality functions");
17299 }
17300
17301 /// When the MSVC runtime transfers control to us, either to an outlined
17302 /// function or when returning to a parent frame after catching an exception, we
17303 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
17304 /// Here's the math:
17305 /// RegNodeBase = EntryEBP - RegNodeSize
17306 /// ParentFP = RegNodeBase - ParentFrameOffset
17307 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
17308 /// subtracting the offset (negative on x86) takes us back to the parent FP.
17309 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
17310 SDValue EntryEBP) {
17311 MachineFunction &MF = DAG.getMachineFunction();
17312 SDLoc dl;
17313
17314 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17315 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
17316
17317 // It's possible that the parent function no longer has a personality function
17318 // if the exceptional code was optimized away, in which case we just return
17319 // the incoming EBP.
17320 if (!Fn->hasPersonalityFn())
17321 return EntryEBP;
17322
17323 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
17324 // registration, or the .set_setframe offset.
17325 MCSymbol *OffsetSym =
17326 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
17327 GlobalValue::getRealLinkageName(Fn->getName()));
17328 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
17329 SDValue ParentFrameOffset =
17330 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
17331
17332 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
17333 // prologue to RBP in the parent function.
17334 const X86Subtarget &Subtarget =
17335 static_cast<const X86Subtarget &>(DAG.getSubtarget());
17336 if (Subtarget.is64Bit())
17337 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
17338
17339 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
17340 // RegNodeBase = EntryEBP - RegNodeSize
17341 // ParentFP = RegNodeBase - ParentFrameOffset
17342 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
17343 DAG.getConstant(RegNodeSize, dl, PtrVT));
17344 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
17345 }
17346
17347 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
17348 SelectionDAG &DAG) {
17349 SDLoc dl(Op);
17350 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17351 MVT VT = Op.getSimpleValueType();
17352 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17353 if (IntrData) {
17354 switch(IntrData->Type) {
17355 case INTR_TYPE_1OP:
17356 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17357 case INTR_TYPE_2OP:
17358 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17359 Op.getOperand(2));
17360 case INTR_TYPE_2OP_IMM8:
17361 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17362 DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2)));
17363 case INTR_TYPE_3OP:
17364 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17365 Op.getOperand(2), Op.getOperand(3));
17366 case INTR_TYPE_4OP:
17367 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17368 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
17369 case INTR_TYPE_1OP_MASK_RM: {
17370 SDValue Src = Op.getOperand(1);
17371 SDValue PassThru = Op.getOperand(2);
17372 SDValue Mask = Op.getOperand(3);
17373 SDValue RoundingMode;
17374 // We always add a rounding mode to the node.
17375 // If the rounding mode is not specified, we use the
17376 // "current direction" mode.
17377 if (Op.getNumOperands() == 4)
17378 RoundingMode =
17379 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17380 else
17381 RoundingMode = Op.getOperand(4);
17382 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17383 if (IntrWithRoundingModeOpcode != 0)
17384 if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() !=
17385 X86::STATIC_ROUNDING::CUR_DIRECTION)
17386 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17387 dl, Op.getValueType(), Src, RoundingMode),
17388 Mask, PassThru, Subtarget, DAG);
17389 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17390 RoundingMode),
17391 Mask, PassThru, Subtarget, DAG);
17392 }
17393 case INTR_TYPE_1OP_MASK: {
17394 SDValue Src = Op.getOperand(1);
17395 SDValue PassThru = Op.getOperand(2);
17396 SDValue Mask = Op.getOperand(3);
17397 // We add a rounding mode to the node when
17398 // - an RM opcode is specified, and
17399 // - RM is not "current direction".
17400 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17401 if (IntrWithRoundingModeOpcode != 0) {
17402 SDValue Rnd = Op.getOperand(4);
17403 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
17404 if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17405 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17406 dl, Op.getValueType(),
17407 Src, Rnd),
17408 Mask, PassThru, Subtarget, DAG);
17409 }
17410 }
17411 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
17412 Mask, PassThru, Subtarget, DAG);
17413 }
17414 case INTR_TYPE_SCALAR_MASK: {
17415 SDValue Src1 = Op.getOperand(1);
17416 SDValue Src2 = Op.getOperand(2);
17417 SDValue passThru = Op.getOperand(3);
17418 SDValue Mask = Op.getOperand(4);
17419 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
17420 Mask, passThru, Subtarget, DAG);
17421 }
17422 case INTR_TYPE_SCALAR_MASK_RM: {
17423 SDValue Src1 = Op.getOperand(1);
17424 SDValue Src2 = Op.getOperand(2);
17425 SDValue Src0 = Op.getOperand(3);
17426 SDValue Mask = Op.getOperand(4);
17427 // There are 2 kinds of intrinsics in this group:
17428 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands.
17429 // (2) With both rounding mode and sae - 7 operands.
17430 if (Op.getNumOperands() == 6) {
17431 SDValue Sae = Op.getOperand(5);
17432 unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0;
17433 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2,
17434 Sae),
17435 Mask, Src0, Subtarget, DAG);
17436 }
17437 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
17438 SDValue RoundingMode = Op.getOperand(5);
17439 SDValue Sae = Op.getOperand(6);
17440 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17441 RoundingMode, Sae),
17442 Mask, Src0, Subtarget, DAG);
17443 }
17444 case INTR_TYPE_2OP_MASK:
17445 case INTR_TYPE_2OP_IMM8_MASK: {
17446 SDValue Src1 = Op.getOperand(1);
17447 SDValue Src2 = Op.getOperand(2);
17448 SDValue PassThru = Op.getOperand(3);
17449 SDValue Mask = Op.getOperand(4);
17450
17451 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
17452 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
17453
17454 // We specify 2 possible opcodes for intrinsics with rounding modes.
17455 // First, we check if the intrinsic may have a non-default rounding mode
17456 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17457 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17458 if (IntrWithRoundingModeOpcode != 0) {
17459 SDValue Rnd = Op.getOperand(5);
17460 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
17461 if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17462 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17463 dl, Op.getValueType(),
17464 Src1, Src2, Rnd),
17465 Mask, PassThru, Subtarget, DAG);
17466 }
17467 }
17468 // TODO: Intrinsics should have fast-math-flags to propagate.
17469 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
17470 Mask, PassThru, Subtarget, DAG);
17471 }
17472 case INTR_TYPE_2OP_MASK_RM: {
17473 SDValue Src1 = Op.getOperand(1);
17474 SDValue Src2 = Op.getOperand(2);
17475 SDValue PassThru = Op.getOperand(3);
17476 SDValue Mask = Op.getOperand(4);
17477 // We support two forms of these intrinsics: with and without an explicit
17478 // rounding mode.
17479 // First, we check whether the intrinsic has a rounding mode (6 operands);
17480 // if not, we set the rounding mode to "current".
17481 SDValue Rnd;
17482 if (Op.getNumOperands() == 6)
17483 Rnd = Op.getOperand(5);
17484 else
17485 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17486 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17487 Src1, Src2, Rnd),
17488 Mask, PassThru, Subtarget, DAG);
17489 }
17490 case INTR_TYPE_3OP_SCALAR_MASK_RM: {
17491 SDValue Src1 = Op.getOperand(1);
17492 SDValue Src2 = Op.getOperand(2);
17493 SDValue Src3 = Op.getOperand(3);
17494 SDValue PassThru = Op.getOperand(4);
17495 SDValue Mask = Op.getOperand(5);
17496 SDValue Sae = Op.getOperand(6);
17497
17498 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
17499 Src2, Src3, Sae),
17500 Mask, PassThru, Subtarget, DAG);
17501 }
17502 case INTR_TYPE_3OP_MASK_RM: {
17503 SDValue Src1 = Op.getOperand(1);
17504 SDValue Src2 = Op.getOperand(2);
17505 SDValue Imm = Op.getOperand(3);
17506 SDValue PassThru = Op.getOperand(4);
17507 SDValue Mask = Op.getOperand(5);
17508 // We specify 2 possible modes for intrinsics: with and without rounding
17509 // modes.
17510 // First, we check if the intrinsic has a rounding mode (7 operands);
17511 // if not, we set the rounding mode to "current".
17512 SDValue Rnd;
17513 if (Op.getNumOperands() == 7)
17514 Rnd = Op.getOperand(6);
17515 else
17516 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17517 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17518 Src1, Src2, Imm, Rnd),
17519 Mask, PassThru, Subtarget, DAG);
17520 }
17521 case INTR_TYPE_3OP_IMM8_MASK:
17522 case INTR_TYPE_3OP_MASK:
17523 case INSERT_SUBVEC: {
17524 SDValue Src1 = Op.getOperand(1);
17525 SDValue Src2 = Op.getOperand(2);
17526 SDValue Src3 = Op.getOperand(3);
17527 SDValue PassThru = Op.getOperand(4);
17528 SDValue Mask = Op.getOperand(5);
17529
17530 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
17531 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
17532 else if (IntrData->Type == INSERT_SUBVEC) {
17533 // The immediate should be adapted to ISD::INSERT_SUBVECTOR behavior.
17534 assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
17535 unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
17536 Imm *= Src2.getSimpleValueType().getVectorNumElements();
17537 Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
17538 }
17539
17540 // We specify 2 possible opcodes for intrinsics with rounding modes.
17541 // First, we check if the intrinsic may have a non-default rounding mode
17542 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17543 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17544 if (IntrWithRoundingModeOpcode != 0) {
17545 SDValue Rnd = Op.getOperand(6);
17546 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
17547 if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17548 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17549 dl, Op.getValueType(),
17550 Src1, Src2, Src3, Rnd),
17551 Mask, PassThru, Subtarget, DAG);
17552 }
17553 }
17554 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17555 Src1, Src2, Src3),
17556 Mask, PassThru, Subtarget, DAG);
17557 }
17558 case VPERM_2OP_MASK: {
17559 SDValue Src1 = Op.getOperand(1);
17560 SDValue Src2 = Op.getOperand(2);
17561 SDValue PassThru = Op.getOperand(3);
17562 SDValue Mask = Op.getOperand(4);
17563
17564 // Swap Src1 and Src2 in the node creation.
17565 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1),
17566 Mask, PassThru, Subtarget, DAG);
17567 }
17568 case VPERM_3OP_MASKZ:
17569 case VPERM_3OP_MASK: {
17570 // Src2 is the PassThru
17571 SDValue Src1 = Op.getOperand(1);
17572 SDValue Src2 = Op.getOperand(2);
17573 SDValue Src3 = Op.getOperand(3);
17574 SDValue Mask = Op.getOperand(4);
17575 MVT VT = Op.getSimpleValueType();
17576 SDValue PassThru = SDValue();
17577
17578 // set PassThru element
17579 if (IntrData->Type == VPERM_3OP_MASKZ)
17580 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17581 else
17582 PassThru = DAG.getBitcast(VT, Src2);
17583
17584 // Swap Src1 and Src2 in the node creation
17585 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17586 dl, Op.getValueType(),
17587 Src2, Src1, Src3),
17588 Mask, PassThru, Subtarget, DAG);
17589 }
17590 case FMA_OP_MASK3:
17591 case FMA_OP_MASKZ:
17592 case FMA_OP_MASK: {
17593 SDValue Src1 = Op.getOperand(1);
17594 SDValue Src2 = Op.getOperand(2);
17595 SDValue Src3 = Op.getOperand(3);
17596 SDValue Mask = Op.getOperand(4);
17597 MVT VT = Op.getSimpleValueType();
17598 SDValue PassThru = SDValue();
17599
17600 // set PassThru element
17601 if (IntrData->Type == FMA_OP_MASKZ)
17602 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17603 else if (IntrData->Type == FMA_OP_MASK3)
17604 PassThru = Src3;
17605 else
17606 PassThru = Src1;
17607
17608 // We specify 2 possible opcodes for intrinsics with rounding modes.
17609 // First, we check if the intrinsic may have a non-default rounding mode
17610 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17611 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17612 if (IntrWithRoundingModeOpcode != 0) {
17613 SDValue Rnd = Op.getOperand(5);
17614 if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17615 X86::STATIC_ROUNDING::CUR_DIRECTION)
17616 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17617 dl, Op.getValueType(),
17618 Src1, Src2, Src3, Rnd),
17619 Mask, PassThru, Subtarget, DAG);
17620 }
17621 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17622 dl, Op.getValueType(),
17623 Src1, Src2, Src3),
17624 Mask, PassThru, Subtarget, DAG);
17625 }
17626 case FMA_OP_SCALAR_MASK:
17627 case FMA_OP_SCALAR_MASK3:
17628 case FMA_OP_SCALAR_MASKZ: {
17629 SDValue Src1 = Op.getOperand(1);
17630 SDValue Src2 = Op.getOperand(2);
17631 SDValue Src3 = Op.getOperand(3);
17632 SDValue Mask = Op.getOperand(4);
17633 MVT VT = Op.getSimpleValueType();
17634 SDValue PassThru = SDValue();
17635
17636 // set PassThru element
17637 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
17638 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17639 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
17640 PassThru = Src3;
17641 else
17642 PassThru = Src1;
17643
17644 SDValue Rnd = Op.getOperand(5);
17645 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
17646 Op.getValueType(), Src1, Src2,
17647 Src3, Rnd),
17648 Mask, PassThru, Subtarget, DAG);
17649 }
17650 case TERLOG_OP_MASK:
17651 case TERLOG_OP_MASKZ: {
17652 SDValue Src1 = Op.getOperand(1);
17653 SDValue Src2 = Op.getOperand(2);
17654 SDValue Src3 = Op.getOperand(3);
17655 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
17656 SDValue Mask = Op.getOperand(5);
17657 MVT VT = Op.getSimpleValueType();
17658 SDValue PassThru = Src1;
17659 // Set PassThru element.
17660 if (IntrData->Type == TERLOG_OP_MASKZ)
17661 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17662
17663 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17664 Src1, Src2, Src3, Src4),
17665 Mask, PassThru, Subtarget, DAG);
17666 }
17667 case FPCLASS: {
17668 // FPclass intrinsics with mask
17669 SDValue Src1 = Op.getOperand(1);
17670 MVT VT = Src1.getSimpleValueType();
17671 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17672 SDValue Imm = Op.getOperand(2);
17673 SDValue Mask = Op.getOperand(3);
17674 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
17675 Mask.getSimpleValueType().getSizeInBits());
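// The classification below yields a vXi1 mask; it is then masked with the
// caller's mask (selecting zero where the mask is clear), widened into
// BitcastVT via INSERT_SUBVECTOR, and bitcast to the integer mask type the
// intrinsic returns.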
17676 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
17677 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
17678 DAG.getTargetConstant(0, dl, MaskVT),
17679 Subtarget, DAG);
17680 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17681 DAG.getUNDEF(BitcastVT), FPclassMask,
17682 DAG.getIntPtrConstant(0, dl));
17683 return DAG.getBitcast(Op.getValueType(), Res);
17684 }
17685 case FPCLASSS: {
17686 SDValue Src1 = Op.getOperand(1);
17687 SDValue Imm = Op.getOperand(2);
17688 SDValue Mask = Op.getOperand(3);
17689 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
17690 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
17691 DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
17692 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask);
17693 }
17694 case CMP_MASK:
17695 case CMP_MASK_CC: {
17696 // Comparison intrinsics with masks.
17697 // Example of transformation:
17698 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17699 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17700 // (i8 (bitcast
17701 // (v8i1 (insert_subvector undef,
17702 // (v2i1 (and (PCMPEQM %a, %b),
17703 // (extract_subvector
17704 // (v8i1 (bitcast %mask)), 0))), 0))))
17705 MVT VT = Op.getOperand(1).getSimpleValueType();
17706 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17707 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17708 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
17709 Mask.getSimpleValueType().getSizeInBits());
17710 SDValue Cmp;
17711 if (IntrData->Type == CMP_MASK_CC) {
17712 SDValue CC = Op.getOperand(3);
17713 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
17714 // We specify 2 possible opcodes for intrinsics with rounding modes.
17715 // First, we check if the intrinsic may have a non-default rounding mode
17716 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17717 if (IntrData->Opc1 != 0) {
17718 SDValue Rnd = Op.getOperand(5);
17719 if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17720 X86::STATIC_ROUNDING::CUR_DIRECTION)
17721 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
17722 Op.getOperand(2), CC, Rnd);
17723 }
17724 // Default rounding mode.
17725 if (!Cmp.getNode())
17726 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17727 Op.getOperand(2), CC);
17728
17729 } else {
17730 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17731 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17732 Op.getOperand(2));
17733 }
17734 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17735 DAG.getTargetConstant(0, dl,
17736 MaskVT),
17737 Subtarget, DAG);
17738 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17739 DAG.getUNDEF(BitcastVT), CmpMask,
17740 DAG.getIntPtrConstant(0, dl));
17741 return DAG.getBitcast(Op.getValueType(), Res);
17742 }
17743 case CMP_MASK_SCALAR_CC: {
17744 SDValue Src1 = Op.getOperand(1);
17745 SDValue Src2 = Op.getOperand(2);
17746 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
17747 SDValue Mask = Op.getOperand(4);
17748
17749 SDValue Cmp;
17750 if (IntrData->Opc1 != 0) {
17751 SDValue Rnd = Op.getOperand(5);
17752 if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17753 X86::STATIC_ROUNDING::CUR_DIRECTION)
17754 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
17755 }
17756 // Default rounding mode.
17757 if (!Cmp.getNode())
17758 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
17759
17760 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
17761 DAG.getTargetConstant(0, dl,
17762 MVT::i1),
17763 Subtarget, DAG);
17764
17765 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
17766 }
17767 case COMI: { // Comparison intrinsics
17768 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17769 SDValue LHS = Op.getOperand(1);
17770 SDValue RHS = Op.getOperand(2);
17771 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17772 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
17773 SDValue SetCC;
17774 switch (CC) {
17775 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
17776 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17777 DAG.getConstant(X86::COND_E, dl, MVT::i8), Comi);
17778 SDValue SetNP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17779 DAG.getConstant(X86::COND_NP, dl, MVT::i8),
17780 Comi);
17781 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
17782 break;
17783 }
17784 case ISD::SETNE: { // (ZF = 1 or PF = 1)
17785 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17786 DAG.getConstant(X86::COND_NE, dl, MVT::i8), Comi);
17787 SDValue SetP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17788 DAG.getConstant(X86::COND_P, dl, MVT::i8),
17789 Comi);
17790 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
17791 break;
17792 }
17793 case ISD::SETGT: // (CF = 0 and ZF = 0)
17794 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17795 DAG.getConstant(X86::COND_A, dl, MVT::i8), Comi);
17796 break;
17797 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
17798 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17799 DAG.getConstant(X86::COND_A, dl, MVT::i8), InvComi);
17800 break;
17801 }
17802 case ISD::SETGE: // CF = 0
17803 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17804 DAG.getConstant(X86::COND_AE, dl, MVT::i8), Comi);
17805 break;
17806 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
17807 SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17808 DAG.getConstant(X86::COND_AE, dl, MVT::i8), InvComi);
17809 break;
17810 default:
17811 llvm_unreachable("Unexpected illegal condition!");
17812 }
17813 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17814 }
17815 case COMI_RM: { // Comparison intrinsics with Sae
17816 SDValue LHS = Op.getOperand(1);
17817 SDValue RHS = Op.getOperand(2);
17818 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
17819 SDValue Sae = Op.getOperand(4);
17820
17821 SDValue FCmp;
17822 if (cast<ConstantSDNode>(Sae)->getZExtValue() ==
17823 X86::STATIC_ROUNDING::CUR_DIRECTION)
17824 FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
17825 DAG.getConstant(CondVal, dl, MVT::i8));
17826 else
17827 FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
17828 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
17829 // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
17830 return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
17831 }
17832 case VSHIFT:
17833 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17834 Op.getOperand(1), Op.getOperand(2), DAG);
17835 case COMPRESS_EXPAND_IN_REG: {
17836 SDValue Mask = Op.getOperand(3);
17837 SDValue DataToCompress = Op.getOperand(1);
17838 SDValue PassThru = Op.getOperand(2);
17839 if (isAllOnesConstant(Mask)) // return data as is
17840 return Op.getOperand(1);
17841
17842 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17843 DataToCompress),
17844 Mask, PassThru, Subtarget, DAG);
17845 }
17846 case BROADCASTM: {
17847 SDValue Mask = Op.getOperand(1);
17848 MVT MaskVT = MVT::getVectorVT(MVT::i1,
17849 Mask.getSimpleValueType().getSizeInBits());
17850 Mask = DAG.getBitcast(MaskVT, Mask);
17851 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
17852 }
17853 case KUNPCK: {
17854 MVT VT = Op.getSimpleValueType();
17855 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
17856
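// The unpack is modeled as concatenating the two incoming masks into a mask
// twice as wide; the node below takes its operands in the opposite order from
// the intrinsic, hence the swap.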
17857 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
17858 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
17859 // Arguments should be swapped.
17860 SDValue Res = DAG.getNode(IntrData->Opc0, dl,
17861 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
17862 Src2, Src1);
17863 return DAG.getBitcast(VT, Res);
17864 }
17865 case FIXUPIMMS:
17866 case FIXUPIMMS_MASKZ:
17867 case FIXUPIMM:
17868 case FIXUPIMM_MASKZ: {
17869 SDValue Src1 = Op.getOperand(1);
17870 SDValue Src2 = Op.getOperand(2);
17871 SDValue Src3 = Op.getOperand(3);
17872 SDValue Imm = Op.getOperand(4);
17873 SDValue Mask = Op.getOperand(5);
17874 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS) ?
17875 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
17876 // We specify 2 possible modes for intrinsics: with and without rounding
17877 // modes.
17878 // First, we check if the intrinsic has a rounding mode (7 operands);
17879 // if not, we set the rounding mode to "current".
17880 SDValue Rnd;
17881 if (Op.getNumOperands() == 7)
17882 Rnd = Op.getOperand(6);
17883 else
17884 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17885 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
17886 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17887 Src1, Src2, Src3, Imm, Rnd),
17888 Mask, Passthru, Subtarget, DAG);
17889 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
17890 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17891 Src1, Src2, Src3, Imm, Rnd),
17892 Mask, Passthru, Subtarget, DAG);
17893 }
17894 case CONVERT_TO_MASK: {
17895 MVT SrcVT = Op.getOperand(1).getSimpleValueType();
17896 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
17897 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
17898
17899 SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
17900 Op.getOperand(1));
17901 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17902 DAG.getUNDEF(BitcastVT), CvtMask,
17903 DAG.getIntPtrConstant(0, dl));
17904 return DAG.getBitcast(Op.getValueType(), Res);
17905 }
17906 case CONVERT_MASK_TO_VEC: {
17907 SDValue Mask = Op.getOperand(1);
17908 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17909 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
17910 return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
17911 }
17912 case BRCST_SUBVEC_TO_VEC: {
17913 SDValue Src = Op.getOperand(1);
17914 SDValue Passthru = Op.getOperand(2);
17915 SDValue Mask = Op.getOperand(3);
17916 EVT resVT = Passthru.getValueType();
17917 SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
17918 DAG.getUNDEF(resVT), Src,
17919 DAG.getIntPtrConstant(0, dl));
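// The shuffle immediate below is chosen so the inserted subvector gets
// replicated across the whole destination (assuming IntrData->Opc0 is a
// 128-bit-lane shuffle such as SHUF128): 0x44 selects lanes {0,1,0,1} when a
// 256-bit subvector is broadcast into 512 bits, and 0 repeats lane 0 in the
// narrower cases.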
17920 SDValue immVal;
17921 if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
17922 immVal = DAG.getConstant(0x44, dl, MVT::i8);
17923 else
17924 immVal = DAG.getConstant(0, dl, MVT::i8);
17925 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17926 subVec, subVec, immVal),
17927 Mask, Passthru, Subtarget, DAG);
17928 }
17929 case BRCST32x2_TO_VEC: {
17930 SDValue Src = Op.getOperand(1);
17931 SDValue PassThru = Op.getOperand(2);
17932 SDValue Mask = Op.getOperand(3);
17933
17934 assert((VT.getScalarType() == MVT::i32 ||
17935 VT.getScalarType() == MVT::f32) && "Unexpected type!");
17936 // Bitcast Src to a vector of packed 64-bit elements.
17937 MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
17938 MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
17939 Src = DAG.getBitcast(BitcastVT, Src);
17940
17941 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
17942 Mask, PassThru, Subtarget, DAG);
17943 }
17944 default:
17945 break;
17946 }
17947 }
17948
17949 switch (IntNo) {
17950 default: return SDValue(); // Don't custom lower most intrinsics.
17951
17952 case Intrinsic::x86_avx2_permd:
17953 case Intrinsic::x86_avx2_permps:
17954 // Operands intentionally swapped. Mask is last operand to intrinsic,
17955 // but second operand for node/instruction.
17956 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
17957 Op.getOperand(2), Op.getOperand(1));
17958
17959 // ptest and testp intrinsics. The intrinsics these come from are designed to
17960 // return an integer value, not just an instruction, so lower them to the
17961 // ptest or testp pattern and a setcc for the result.
17962 case Intrinsic::x86_sse41_ptestz:
17963 case Intrinsic::x86_sse41_ptestc:
17964 case Intrinsic::x86_sse41_ptestnzc:
17965 case Intrinsic::x86_avx_ptestz_256:
17966 case Intrinsic::x86_avx_ptestc_256:
17967 case Intrinsic::x86_avx_ptestnzc_256:
17968 case Intrinsic::x86_avx_vtestz_ps:
17969 case Intrinsic::x86_avx_vtestc_ps:
17970 case Intrinsic::x86_avx_vtestnzc_ps:
17971 case Intrinsic::x86_avx_vtestz_pd:
17972 case Intrinsic::x86_avx_vtestc_pd:
17973 case Intrinsic::x86_avx_vtestnzc_pd:
17974 case Intrinsic::x86_avx_vtestz_ps_256:
17975 case Intrinsic::x86_avx_vtestc_ps_256:
17976 case Intrinsic::x86_avx_vtestnzc_ps_256:
17977 case Intrinsic::x86_avx_vtestz_pd_256:
17978 case Intrinsic::x86_avx_vtestc_pd_256:
17979 case Intrinsic::x86_avx_vtestnzc_pd_256: {
17980 bool IsTestPacked = false;
17981 unsigned X86CC;
17982 switch (IntNo) {
17983 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17984 case Intrinsic::x86_avx_vtestz_ps:
17985 case Intrinsic::x86_avx_vtestz_pd:
17986 case Intrinsic::x86_avx_vtestz_ps_256:
17987 case Intrinsic::x86_avx_vtestz_pd_256:
17988 IsTestPacked = true; // Fallthrough
17989 case Intrinsic::x86_sse41_ptestz:
17990 case Intrinsic::x86_avx_ptestz_256:
17991 // ZF = 1
17992 X86CC = X86::COND_E;
17993 break;
17994 case Intrinsic::x86_avx_vtestc_ps:
17995 case Intrinsic::x86_avx_vtestc_pd:
17996 case Intrinsic::x86_avx_vtestc_ps_256:
17997 case Intrinsic::x86_avx_vtestc_pd_256:
17998 IsTestPacked = true; // Fallthrough
17999 case Intrinsic::x86_sse41_ptestc:
18000 case Intrinsic::x86_avx_ptestc_256:
18001 // CF = 1
18002 X86CC = X86::COND_B;
18003 break;
18004 case Intrinsic::x86_avx_vtestnzc_ps:
18005 case Intrinsic::x86_avx_vtestnzc_pd:
18006 case Intrinsic::x86_avx_vtestnzc_ps_256:
18007 case Intrinsic::x86_avx_vtestnzc_pd_256:
18008 IsTestPacked = true; // Fallthrough
18009 case Intrinsic::x86_sse41_ptestnzc:
18010 case Intrinsic::x86_avx_ptestnzc_256:
18011 // ZF and CF = 0
18012 X86CC = X86::COND_A;
18013 break;
18014 }
18015
18016 SDValue LHS = Op.getOperand(1);
18017 SDValue RHS = Op.getOperand(2);
18018 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
18019 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
18020 SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
18021 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
18022 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18023 }
18024 case Intrinsic::x86_avx512_kortestz_w:
18025 case Intrinsic::x86_avx512_kortestc_w: {
18026 unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
18027 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
18028 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
18029 SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
18030 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
18031 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
18032 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18033 }
18034
18035 case Intrinsic::x86_sse42_pcmpistria128:
18036 case Intrinsic::x86_sse42_pcmpestria128:
18037 case Intrinsic::x86_sse42_pcmpistric128:
18038 case Intrinsic::x86_sse42_pcmpestric128:
18039 case Intrinsic::x86_sse42_pcmpistrio128:
18040 case Intrinsic::x86_sse42_pcmpestrio128:
18041 case Intrinsic::x86_sse42_pcmpistris128:
18042 case Intrinsic::x86_sse42_pcmpestris128:
18043 case Intrinsic::x86_sse42_pcmpistriz128:
18044 case Intrinsic::x86_sse42_pcmpestriz128: {
18045 unsigned Opcode;
18046 unsigned X86CC;
18047 switch (IntNo) {
18048 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
18049 case Intrinsic::x86_sse42_pcmpistria128:
18050 Opcode = X86ISD::PCMPISTRI;
18051 X86CC = X86::COND_A;
18052 break;
18053 case Intrinsic::x86_sse42_pcmpestria128:
18054 Opcode = X86ISD::PCMPESTRI;
18055 X86CC = X86::COND_A;
18056 break;
18057 case Intrinsic::x86_sse42_pcmpistric128:
18058 Opcode = X86ISD::PCMPISTRI;
18059 X86CC = X86::COND_B;
18060 break;
18061 case Intrinsic::x86_sse42_pcmpestric128:
18062 Opcode = X86ISD::PCMPESTRI;
18063 X86CC = X86::COND_B;
18064 break;
18065 case Intrinsic::x86_sse42_pcmpistrio128:
18066 Opcode = X86ISD::PCMPISTRI;
18067 X86CC = X86::COND_O;
18068 break;
18069 case Intrinsic::x86_sse42_pcmpestrio128:
18070 Opcode = X86ISD::PCMPESTRI;
18071 X86CC = X86::COND_O;
18072 break;
18073 case Intrinsic::x86_sse42_pcmpistris128:
18074 Opcode = X86ISD::PCMPISTRI;
18075 X86CC = X86::COND_S;
18076 break;
18077 case Intrinsic::x86_sse42_pcmpestris128:
18078 Opcode = X86ISD::PCMPESTRI;
18079 X86CC = X86::COND_S;
18080 break;
18081 case Intrinsic::x86_sse42_pcmpistriz128:
18082 Opcode = X86ISD::PCMPISTRI;
18083 X86CC = X86::COND_E;
18084 break;
18085 case Intrinsic::x86_sse42_pcmpestriz128:
18086 Opcode = X86ISD::PCMPESTRI;
18087 X86CC = X86::COND_E;
18088 break;
18089 }
18090 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
18091 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
18092 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
18093 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18094 DAG.getConstant(X86CC, dl, MVT::i8),
18095 SDValue(PCMP.getNode(), 1));
18096 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18097 }
18098
18099 case Intrinsic::x86_sse42_pcmpistri128:
18100 case Intrinsic::x86_sse42_pcmpestri128: {
18101 unsigned Opcode;
18102 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
18103 Opcode = X86ISD::PCMPISTRI;
18104 else
18105 Opcode = X86ISD::PCMPESTRI;
18106
18107 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
18108 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
18109 return DAG.getNode(Opcode, dl, VTs, NewOps);
18110 }
18111
18112 case Intrinsic::eh_sjlj_lsda: {
18113 MachineFunction &MF = DAG.getMachineFunction();
18114 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18115 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
18116 auto &Context = MF.getMMI().getContext();
18117 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
18118 Twine(MF.getFunctionNumber()));
18119 return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
18120 }
18121
18122 case Intrinsic::x86_seh_lsda: {
18123 // Compute the symbol for the LSDA. We know it'll get emitted later.
18124 MachineFunction &MF = DAG.getMachineFunction();
18125 SDValue Op1 = Op.getOperand(1);
18126 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
18127 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
18128 GlobalValue::getRealLinkageName(Fn->getName()));
18129
18130 // Generate a simple absolute symbol reference. This intrinsic is only
18131 // supported on 32-bit Windows, which isn't PIC.
18132 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
18133 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
18134 }
18135
18136 case Intrinsic::x86_seh_recoverfp: {
18137 SDValue FnOp = Op.getOperand(1);
18138 SDValue IncomingFPOp = Op.getOperand(2);
18139 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
18140 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
18141 if (!Fn)
18142 report_fatal_error(
18143 "llvm.x86.seh.recoverfp must take a function as the first argument");
18144 return recoverFramePointer(DAG, Fn, IncomingFPOp);
18145 }
18146
18147 case Intrinsic::localaddress: {
18148 // Returns one of the stack, base, or frame pointer registers, depending on
18149 // which is used to reference local variables.
18150 MachineFunction &MF = DAG.getMachineFunction();
18151 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18152 unsigned Reg;
18153 if (RegInfo->hasBasePointer(MF))
18154 Reg = RegInfo->getBaseRegister();
18155 else // This function handles the SP or FP case.
18156 Reg = RegInfo->getPtrSizedFrameRegister(MF);
18157 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
18158 }
18159 }
18160 }
18161
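/// Build a masked gather machine node for the AVX-512 gather intrinsics.
/// The scale operand must be a constant; the mask is normalized to a vXi1
/// value matching the index width, and an undef source is replaced with a
/// zero vector so the pass-through lanes are well defined. Returns the
/// gathered value merged with the output chain.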
18162 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
18163 SDValue Src, SDValue Mask, SDValue Base,
18164 SDValue Index, SDValue ScaleOp, SDValue Chain,
18165 const X86Subtarget &Subtarget) {
18166 SDLoc dl(Op);
18167 auto *C = cast<ConstantSDNode>(ScaleOp);
18168 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
18169 MVT MaskVT = MVT::getVectorVT(MVT::i1,
18170 Index.getSimpleValueType().getVectorNumElements());
18171
18172 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18173 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
18174 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
18175 SDValue Segment = DAG.getRegister(0, MVT::i32);
18176 if (Src.isUndef())
18177 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
18178 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
18179 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
18180 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
18181 return DAG.getMergeValues(RetOps, dl);
18182 }
18183
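/// Build a masked scatter machine node for the AVX-512 scatter intrinsics.
/// As with gathers, the scale must be a constant and the mask is widened to
/// a vXi1 type matching the index vector; only the chain result is returned.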
18184 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
18185 SDValue Src, SDValue Mask, SDValue Base,
18186 SDValue Index, SDValue ScaleOp, SDValue Chain,
18187 const X86Subtarget &Subtarget) {
18188 SDLoc dl(Op);
18189 auto *C = cast<ConstantSDNode>(ScaleOp);
18190 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
18191 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
18192 SDValue Segment = DAG.getRegister(0, MVT::i32);
18193 MVT MaskVT = MVT::getVectorVT(MVT::i1,
18194 Index.getSimpleValueType().getVectorNumElements());
18195
18196 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18197 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
18198 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
18199 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
18200 return SDValue(Res, 1);
18201 }
18202
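/// Build a gather/scatter prefetch machine node. The operand layout mirrors
/// the scatter case (mask, base, scale, index, zero displacement, no segment
/// register) and the node only produces a chain.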
18203 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
18204 SDValue Mask, SDValue Base, SDValue Index,
18205 SDValue ScaleOp, SDValue Chain,
18206 const X86Subtarget &Subtarget) {
18207 SDLoc dl(Op);
18208 auto *C = cast<ConstantSDNode>(ScaleOp);
18209 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
18210 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
18211 SDValue Segment = DAG.getRegister(0, MVT::i32);
18212 MVT MaskVT =
18213 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
18214 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18216 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
18217 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
18218 return SDValue(Res, 0);
18219 }
18220
18221 /// Handles the lowering of builtin intrinsics that read performance monitor
18222 /// counters (x86_rdpmc).
18223 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
18224 SelectionDAG &DAG,
18225 const X86Subtarget &Subtarget,
18226 SmallVectorImpl<SDValue> &Results) {
18227 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
18228 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
18229 SDValue LO, HI;
18230
18231 // The ECX register is used to select the index of the performance counter
18232 // to read.
18233 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
18234 N->getOperand(2));
18235 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
18236
18237 // Reads the content of a 64-bit performance counter and returns it in the
18238 // registers EDX:EAX.
18239 if (Subtarget.is64Bit()) {
18240 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
18241 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
18242 LO.getValue(2));
18243 } else {
18244 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
18245 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
18246 LO.getValue(2));
18247 }
18248 Chain = HI.getValue(1);
18249
18250 if (Subtarget.is64Bit()) {
18251 // The EAX register is loaded with the low-order 32 bits. The EDX register
18252 // is loaded with the supported high-order bits of the counter.
18253 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
18254 DAG.getConstant(32, DL, MVT::i8));
18255 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
18256 Results.push_back(Chain);
18257 return;
18258 }
18259
18260 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
18261 SDValue Ops[] = { LO, HI };
18262 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
18263 Results.push_back(Pair);
18264 Results.push_back(Chain);
18265 }
18266
18267 /// Handles the lowering of builtin intrinsics that read the time stamp counter
18268 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
18269 /// READCYCLECOUNTER nodes.
18270 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
18271 SelectionDAG &DAG,
18272 const X86Subtarget &Subtarget,
18273 SmallVectorImpl<SDValue> &Results) {
18274 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
18275 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
18276 SDValue LO, HI;
18277
18278 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
18279 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
18280 // and the EAX register is loaded with the low-order 32 bits.
18281 if (Subtarget.is64Bit()) {
18282 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
18283 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
18284 LO.getValue(2));
18285 } else {
18286 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
18287 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
18288 LO.getValue(2));
18289 }
18290 SDValue Chain = HI.getValue(1);
18291
18292 if (Opcode == X86ISD::RDTSCP_DAG) {
18293 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
18294
18295 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
18296 // the ECX register. Add 'ecx' explicitly to the chain.
18297 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
18298 HI.getValue(2));
18299 // Explicitly store the content of ECX at the location passed in input
18300 // to the 'rdtscp' intrinsic.
18301 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
18302 MachinePointerInfo(), false, false, 0);
18303 }
18304
18305 if (Subtarget.is64Bit()) {
18306 // The EDX register is loaded with the high-order 32 bits of the MSR, and
18307 // the EAX register is loaded with the low-order 32 bits.
18308 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
18309 DAG.getConstant(32, DL, MVT::i8));
18310 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
18311 Results.push_back(Chain);
18312 return;
18313 }
18314
18315 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
18316 SDValue Ops[] = { LO, HI };
18317 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
18318 Results.push_back(Pair);
18319 Results.push_back(Chain);
18320 }
18321
18322 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
18323 SelectionDAG &DAG) {
18324 SmallVector<SDValue, 2> Results;
18325 SDLoc DL(Op);
18326 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
18327 Results);
18328 return DAG.getMergeValues(Results, DL);
18329 }
18330
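/// Record the frame index of the EH registration node allocated by the
/// frontend (llvm.x86.seh.ehregnode) in WinEHFuncInfo. No DAG nodes are
/// created; the incoming chain is returned unchanged.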
18331 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
18332 MachineFunction &MF = DAG.getMachineFunction();
18333 SDValue Chain = Op.getOperand(0);
18334 SDValue RegNode = Op.getOperand(2);
18335 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
18336 if (!EHInfo)
18337 report_fatal_error("EH registrations only live in functions using WinEH");
18338
18339 // Cast the operand to an alloca, and remember the frame index.
18340 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
18341 if (!FINode)
18342 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
18343 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
18344
18345 // Return the chain operand without making any DAG nodes.
18346 return Chain;
18347 }
18348
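/// Record the frame index of the EH guard slot (llvm.x86.seh.ehguard) in
/// WinEHFuncInfo, analogously to MarkEHRegistrationNode; only the incoming
/// chain is returned.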
18349 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
18350 MachineFunction &MF = DAG.getMachineFunction();
18351 SDValue Chain = Op.getOperand(0);
18352 SDValue EHGuard = Op.getOperand(2);
18353 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
18354 if (!EHInfo)
18355 report_fatal_error("EHGuard only live in functions using WinEH");
18356
18357 // Cast the operand to an alloca, and remember the frame index.
18358 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
18359 if (!FINode)
18360 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
18361 EHInfo->EHGuardFrameIndex = FINode->getIndex();
18362
18363 // Return the chain operand without making any DAG nodes.
18364 return Chain;
18365 }
18366
18367 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
18368 SelectionDAG &DAG) {
18369 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
18370
18371 const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
18372 if (!IntrData) {
18373 if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
18374 return MarkEHRegistrationNode(Op, DAG);
18375 if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
18376 return MarkEHGuard(Op, DAG);
18377 if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
18378 IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
18379 IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
18380 IntNo == llvm::Intrinsic::x86_flags_write_u64) {
18381 // We need a frame pointer because this will get lowered to a PUSH/POP
18382 // sequence.
18383 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
18384 MFI->setHasCopyImplyingStackAdjustment(true);
18385 // Don't do anything here, we will expand these intrinsics out later
18386 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
18387 return SDValue();
18388 }
18389 return SDValue();
18390 }
18391
18392 SDLoc dl(Op);
18393 switch(IntrData->Type) {
18394 default: llvm_unreachable("Unknown Intrinsic Type");
18395 case RDSEED:
18396 case RDRAND: {
18397 // Emit the node with the right value type.
18398 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
18399 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
18400
18401 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
18402 // Otherwise, return the value from Rand, which is always 0, cast to i32.
18403 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
18404 DAG.getConstant(1, dl, Op->getValueType(1)),
18405 DAG.getConstant(X86::COND_B, dl, MVT::i32),
18406 SDValue(Result.getNode(), 1) };
18407 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
18408 DAG.getVTList(Op->getValueType(1), MVT::Glue),
18409 Ops);
18410
18411 // Return { result, isValid, chain }.
18412 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
18413 SDValue(Result.getNode(), 2));
18414 }
18415 case GATHER: {
18416 // gather(v1, mask, index, base, scale);
18417 SDValue Chain = Op.getOperand(0);
18418 SDValue Src = Op.getOperand(2);
18419 SDValue Base = Op.getOperand(3);
18420 SDValue Index = Op.getOperand(4);
18421 SDValue Mask = Op.getOperand(5);
18422 SDValue Scale = Op.getOperand(6);
18423 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
18424 Chain, Subtarget);
18425 }
18426 case SCATTER: {
18427 // scatter(base, mask, index, v1, scale);
18428 SDValue Chain = Op.getOperand(0);
18429 SDValue Base = Op.getOperand(2);
18430 SDValue Mask = Op.getOperand(3);
18431 SDValue Index = Op.getOperand(4);
18432 SDValue Src = Op.getOperand(5);
18433 SDValue Scale = Op.getOperand(6);
18434 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
18435 Scale, Chain, Subtarget);
18436 }
18437 case PREFETCH: {
18438 SDValue Hint = Op.getOperand(6);
18439 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
18440 assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
18441 unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
18442 SDValue Chain = Op.getOperand(0);
18443 SDValue Mask = Op.getOperand(2);
18444 SDValue Index = Op.getOperand(3);
18445 SDValue Base = Op.getOperand(4);
18446 SDValue Scale = Op.getOperand(5);
18447 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
18448 Subtarget);
18449 }
18450 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
18451 case RDTSC: {
18452 SmallVector<SDValue, 2> Results;
18453 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
18454 Results);
18455 return DAG.getMergeValues(Results, dl);
18456 }
18457 // Read Performance Monitoring Counters.
18458 case RDPMC: {
18459 SmallVector<SDValue, 2> Results;
18460 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
18461 return DAG.getMergeValues(Results, dl);
18462 }
18463 // XTEST intrinsics.
18464 case XTEST: {
18465 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
18466 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
18467 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18468 DAG.getConstant(X86::COND_NE, dl, MVT::i8),
18469 InTrans);
18470 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
18471 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
18472 Ret, SDValue(InTrans.getNode(), 1));
18473 }
18474 // ADC/ADCX/SBB
18475 case ADX: {
18476 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
18477 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
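// Re-materialize the incoming carry flag: adding -1 (0xFF) to the i8 carry
// operand produces a carry-out exactly when that operand is non-zero, and
// the second (flags) result of GenCF feeds the ADC/SBB-style node below as
// its carry-in.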
18478 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
18479 DAG.getConstant(-1, dl, MVT::i8));
18480 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
18481 Op.getOperand(4), GenCF.getValue(1));
18482 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
18483 Op.getOperand(5), MachinePointerInfo(),
18484 false, false, 0);
18485 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18486 DAG.getConstant(X86::COND_B, dl, MVT::i8),
18487 Res.getValue(1));
18488 SDValue Results[] = { SetCC, Store };
18489 return DAG.getMergeValues(Results, dl);
18490 }
18491 case COMPRESS_TO_MEM: {
18492 SDValue Mask = Op.getOperand(4);
18493 SDValue DataToCompress = Op.getOperand(3);
18494 SDValue Addr = Op.getOperand(2);
18495 SDValue Chain = Op.getOperand(0);
18496 MVT VT = DataToCompress.getSimpleValueType();
18497
18498 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
18499 assert(MemIntr && "Expected MemIntrinsicSDNode!");
18500
18501 if (isAllOnesConstant(Mask)) // return just a store
18502 return DAG.getStore(Chain, dl, DataToCompress, Addr,
18503 MemIntr->getMemOperand());
18504
18505 SDValue Compressed =
18506 getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress),
18507 Mask, DAG.getUNDEF(VT), Subtarget, DAG);
18508 return DAG.getStore(Chain, dl, Compressed, Addr,
18509 MemIntr->getMemOperand());
18510 }
18511 case TRUNCATE_TO_MEM_VI8:
18512 case TRUNCATE_TO_MEM_VI16:
18513 case TRUNCATE_TO_MEM_VI32: {
18514 SDValue Mask = Op.getOperand(4);
18515 SDValue DataToTruncate = Op.getOperand(3);
18516 SDValue Addr = Op.getOperand(2);
18517 SDValue Chain = Op.getOperand(0);
18518
18519 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
18520 assert(MemIntr && "Expected MemIntrinsicSDNode!");
18521
18522 EVT VT = MemIntr->getMemoryVT();
18523
18524 if (isAllOnesConstant(Mask)) // return just a truncate store
18525 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, VT,
18526 MemIntr->getMemOperand());
18527
18528 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
18529 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18530
18531 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, VT,
18532 MemIntr->getMemOperand(), true);
18533 }
18534 case EXPAND_FROM_MEM: {
18535 SDValue Mask = Op.getOperand(4);
18536 SDValue PassThru = Op.getOperand(3);
18537 SDValue Addr = Op.getOperand(2);
18538 SDValue Chain = Op.getOperand(0);
18539 MVT VT = Op.getSimpleValueType();
18540
18541 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
18542 assert(MemIntr && "Expected MemIntrinsicSDNode!");
18543
18544 SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr,
18545 MemIntr->getMemOperand());
18546
18547 if (isAllOnesConstant(Mask)) // return just a load
18548 return DataToExpand;
18549
18550 SDValue Results[] = {
18551 getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand),
18552 Mask, PassThru, Subtarget, DAG), Chain};
18553 return DAG.getMergeValues(Results, dl);
18554 }
18555 }
18556 }
18557
18558 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
18559 SelectionDAG &DAG) const {
18560 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
18561 MFI->setReturnAddressIsTaken(true);
18562
18563 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
18564 return SDValue();
18565
18566 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18567 SDLoc dl(Op);
18568 EVT PtrVT = getPointerTy(DAG.getDataLayout());
18569
18570 if (Depth > 0) {
18571 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
18572 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18573 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
18574 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
18575 DAG.getNode(ISD::ADD, dl, PtrVT,
18576 FrameAddr, Offset),
18577 MachinePointerInfo(), false, false, false, 0);
18578 }
18579
18580 // Just load the return address.
18581 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
18582 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
18583 RetAddrFI, MachinePointerInfo(), false, false, false, 0);
18584 }
18585
18586 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
18587 MachineFunction &MF = DAG.getMachineFunction();
18588 MachineFrameInfo *MFI = MF.getFrameInfo();
18589 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18590 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18591 EVT VT = Op.getValueType();
18592
18593 MFI->setFrameAddressIsTaken(true);
18594
18595 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
18596 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
18597 // is not possible to crawl up the stack without looking at the unwind codes
18598 // simultaneously.
18599 int FrameAddrIndex = FuncInfo->getFAIndex();
18600 if (!FrameAddrIndex) {
18601 // Set up a frame object for the return address.
18602 unsigned SlotSize = RegInfo->getSlotSize();
18603 FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
18604 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
18605 FuncInfo->setFAIndex(FrameAddrIndex);
18606 }
18607 return DAG.getFrameIndex(FrameAddrIndex, VT);
18608 }
18609
18610 unsigned FrameReg =
18611 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
18612 SDLoc dl(Op); // FIXME probably not meaningful
18613 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18614 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
18615 (FrameReg == X86::EBP && VT == MVT::i32)) &&
18616 "Invalid Frame Register!");
18617 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
18618 while (Depth--)
18619 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
18620 MachinePointerInfo(),
18621 false, false, false, 0);
18622 return FrameAddr;
18623 }
18624
18625 // FIXME? Maybe this could be a TableGen attribute on some registers and
18626 // this table could be generated automatically from RegInfo.
18627 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
18628 SelectionDAG &DAG) const {
18629 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18630 const MachineFunction &MF = DAG.getMachineFunction();
18631
18632 unsigned Reg = StringSwitch<unsigned>(RegName)
18633 .Case("esp", X86::ESP)
18634 .Case("rsp", X86::RSP)
18635 .Case("ebp", X86::EBP)
18636 .Case("rbp", X86::RBP)
18637 .Default(0);
18638
18639 if (Reg == X86::EBP || Reg == X86::RBP) {
18640 if (!TFI.hasFP(MF))
18641 report_fatal_error("register " + StringRef(RegName) +
18642 " is allocatable: function has no frame pointer");
18643 #ifndef NDEBUG
18644 else {
18645 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18646 unsigned FrameReg =
18647 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
18648 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
18649 "Invalid Frame Register!");
18650 }
18651 #endif
18652 }
18653
18654 if (Reg)
18655 return Reg;
18656
18657 report_fatal_error("Invalid register name global variable");
18658 }
18659
18660 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
18661 SelectionDAG &DAG) const {
18662 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18663 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
18664 }
18665
18666 unsigned X86TargetLowering::getExceptionPointerRegister(
18667 const Constant *PersonalityFn) const {
18668 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
18669 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
18670
18671 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
18672 }
18673
18674 unsigned X86TargetLowering::getExceptionSelectorRegister(
18675 const Constant *PersonalityFn) const {
18676 // Funclet personalities don't use selectors (the runtime does the selection).
18677 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
18678 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
18679 }
18680
18681 bool X86TargetLowering::needsFixedCatchObjects() const {
18682 return Subtarget.isTargetWin64();
18683 }
18684
18685 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
18686 SDValue Chain = Op.getOperand(0);
18687 SDValue Offset = Op.getOperand(1);
18688 SDValue Handler = Op.getOperand(2);
18689 SDLoc dl (Op);
18690
18691 EVT PtrVT = getPointerTy(DAG.getDataLayout());
18692 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18693 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
18694 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
18695 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
18696 "Invalid Frame Register!");
18697 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
18698 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
18699
18700 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
18701 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
18702 dl));
18703 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
18704 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
18705 false, false, 0);
18706 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
18707
18708 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
18709 DAG.getRegister(StoreAddrReg, PtrVT));
18710 }
18711
18712 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
18713 SelectionDAG &DAG) const {
18714 SDLoc DL(Op);
18715 // If the subtarget is not 64bit, we may need the global base reg
18716 // after isel expand pseudo, i.e., after CGBR pass ran.
18717 // Therefore, ask for the GlobalBaseReg now, so that the pass
18718 // inserts the code for us in case we need it.
18719 // Otherwise, we will end up in a situation where we will
18720 // reference a virtual register that is not defined!
18721 if (!Subtarget.is64Bit()) {
18722 const X86InstrInfo *TII = Subtarget.getInstrInfo();
18723 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
18724 }
18725 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
18726 DAG.getVTList(MVT::i32, MVT::Other),
18727 Op.getOperand(0), Op.getOperand(1));
18728 }
18729
18730 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
18731 SelectionDAG &DAG) const {
18732 SDLoc DL(Op);
18733 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
18734 Op.getOperand(0), Op.getOperand(1));
18735 }
18736
18737 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
18738 SelectionDAG &DAG) const {
18739 SDLoc DL(Op);
18740 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
18741 Op.getOperand(0));
18742 }
18743
18744 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
18745 return Op.getOperand(0);
18746 }
18747
18748 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
18749 SelectionDAG &DAG) const {
18750 SDValue Root = Op.getOperand(0);
18751 SDValue Trmp = Op.getOperand(1); // trampoline
18752 SDValue FPtr = Op.getOperand(2); // nested function
18753 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
18754 SDLoc dl (Op);
18755
18756 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18757 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
18758
18759 if (Subtarget.is64Bit()) {
18760 SDValue OutChains[6];
18761
18762 // Large code-model.
18763 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
18764 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
18765
18766 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
18767 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
18768
18769 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
18770
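// For illustration, the trampoline bytes written below look roughly like
// this (byte offsets on the left):
//    0: 49 BB <FPtr:imm64>   movabsq $FPtr, %r11
//   10: 49 BA <Nest:imm64>   movabsq $Nest, %r10
//   20: 49 FF E3             jmpq   *%r11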
18771 // Load the pointer to the nested function into R11.
18772 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
18773 SDValue Addr = Trmp;
18774 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
18775 Addr, MachinePointerInfo(TrmpAddr),
18776 false, false, 0);
18777
18778 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18779 DAG.getConstant(2, dl, MVT::i64));
18780 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
18781 MachinePointerInfo(TrmpAddr, 2),
18782 false, false, 2);
18783
18784 // Load the 'nest' parameter value into R10.
18785 // R10 is specified in X86CallingConv.td
18786 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
18787 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18788 DAG.getConstant(10, dl, MVT::i64));
18789 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
18790 Addr, MachinePointerInfo(TrmpAddr, 10),
18791 false, false, 0);
18792
18793 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18794 DAG.getConstant(12, dl, MVT::i64));
18795 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
18796 MachinePointerInfo(TrmpAddr, 12),
18797 false, false, 2);
18798
18799 // Jump to the nested function.
18800 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
18801 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18802 DAG.getConstant(20, dl, MVT::i64));
18803 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
18804 Addr, MachinePointerInfo(TrmpAddr, 20),
18805 false, false, 0);
18806
18807 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
18808 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18809 DAG.getConstant(22, dl, MVT::i64));
18810 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
18811 Addr, MachinePointerInfo(TrmpAddr, 22),
18812 false, false, 0);
18813
18814 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18815 } else {
18816 const Function *Func =
18817 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
18818 CallingConv::ID CC = Func->getCallingConv();
18819 unsigned NestReg;
18820
18821 switch (CC) {
18822 default:
18823 llvm_unreachable("Unsupported calling convention");
18824 case CallingConv::C:
18825 case CallingConv::X86_StdCall: {
18826 // Pass 'nest' parameter in ECX.
18827 // Must be kept in sync with X86CallingConv.td
18828 NestReg = X86::ECX;
18829
18830 // Check that ECX wasn't needed by an 'inreg' parameter.
18831 FunctionType *FTy = Func->getFunctionType();
18832 const AttributeSet &Attrs = Func->getAttributes();
18833
18834 if (!Attrs.isEmpty() && !Func->isVarArg()) {
18835 unsigned InRegCount = 0;
18836 unsigned Idx = 1;
18837
18838 for (FunctionType::param_iterator I = FTy->param_begin(),
18839 E = FTy->param_end(); I != E; ++I, ++Idx)
18840 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
18841 auto &DL = DAG.getDataLayout();
18842 // FIXME: should only count parameters that are lowered to integers.
18843 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
18844 }
18845
18846 if (InRegCount > 2) {
18847 report_fatal_error("Nest register in use - reduce number of inreg"
18848 " parameters!");
18849 }
18850 }
18851 break;
18852 }
18853 case CallingConv::X86_FastCall:
18854 case CallingConv::X86_ThisCall:
18855 case CallingConv::Fast:
18856 // Pass 'nest' parameter in EAX.
18857 // Must be kept in sync with X86CallingConv.td
18858 NestReg = X86::EAX;
18859 break;
18860 }
18861
18862 SDValue OutChains[4];
18863 SDValue Addr, Disp;
18864
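    // The stores below lay out a 10-byte trampoline:
    //   0: B8+reg <imm32 = Nest>   movl $Nest, %<nestreg>
    //   5: E9 <rel32 = Disp>       jmp  FPtr   (Disp = FPtr - (Trmp + 10))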
18865 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18866 DAG.getConstant(10, dl, MVT::i32));
18867 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
18868
18869 // This is storing the opcode for MOV32ri.
18870 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
18871 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
18872 OutChains[0] = DAG.getStore(Root, dl,
18873 DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8),
18874 Trmp, MachinePointerInfo(TrmpAddr),
18875 false, false, 0);
18876
18877 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18878 DAG.getConstant(1, dl, MVT::i32));
18879 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
18880 MachinePointerInfo(TrmpAddr, 1),
18881 false, false, 1);
18882
18883 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
18884 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18885 DAG.getConstant(5, dl, MVT::i32));
18886 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
18887 Addr, MachinePointerInfo(TrmpAddr, 5),
18888 false, false, 1);
18889
18890 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18891 DAG.getConstant(6, dl, MVT::i32));
18892 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
18893 MachinePointerInfo(TrmpAddr, 6),
18894 false, false, 1);
18895
18896 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18897 }
18898 }
18899
18900 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
18901 SelectionDAG &DAG) const {
18902 /*
18903     The rounding mode is in bits 11:10 of the x87 FPU Control Word (FPCW),
18904     and has the following settings:
18905 00 Round to nearest
18906 01 Round to -inf
18907 10 Round to +inf
18908 11 Round to 0
18909
18910 FLT_ROUNDS, on the other hand, expects the following:
18911 -1 Undefined
18912 0 Round to 0
18913 1 Round to nearest
18914 2 Round to +inf
18915 3 Round to -inf
18916
18917 To perform the conversion, we do:
18918     (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
18919 */
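  // For example, with RC = 01 (round to -inf): bit 11 = 0, bit 10 = 1, so
  // ((0 >> 11) | (0x400 >> 9)) = 2 and (2 + 1) & 3 = 3, i.e. "round to -inf".
  // With RC = 11 (round to 0): (1 | 2) = 3 and (3 + 1) & 3 = 0, i.e. "round to 0".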
18920
18921 MachineFunction &MF = DAG.getMachineFunction();
18922 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18923 unsigned StackAlignment = TFI.getStackAlignment();
18924 MVT VT = Op.getSimpleValueType();
18925 SDLoc DL(Op);
18926
18927 // Save FP Control Word to stack slot
18928 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18929 SDValue StackSlot =
18930 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
18931
18932 MachineMemOperand *MMO =
18933 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
18934 MachineMemOperand::MOStore, 2, 2);
18935
18936 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18937 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18938 DAG.getVTList(MVT::Other),
18939 Ops, MVT::i16, MMO);
18940
18941 // Load FP Control Word from stack slot
18942 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18943 MachinePointerInfo(), false, false, false, 0);
18944
18945 // Transform as necessary
18946 SDValue CWD1 =
18947 DAG.getNode(ISD::SRL, DL, MVT::i16,
18948 DAG.getNode(ISD::AND, DL, MVT::i16,
18949 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
18950 DAG.getConstant(11, DL, MVT::i8));
18951 SDValue CWD2 =
18952 DAG.getNode(ISD::SRL, DL, MVT::i16,
18953 DAG.getNode(ISD::AND, DL, MVT::i16,
18954 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
18955 DAG.getConstant(9, DL, MVT::i8));
18956
18957 SDValue RetVal =
18958 DAG.getNode(ISD::AND, DL, MVT::i16,
18959 DAG.getNode(ISD::ADD, DL, MVT::i16,
18960 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18961 DAG.getConstant(1, DL, MVT::i16)),
18962 DAG.getConstant(3, DL, MVT::i16));
18963
18964 return DAG.getNode((VT.getSizeInBits() < 16 ?
18965 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18966 }
18967
18968 /// \brief Lower a vector CTLZ using a natively supported vector CTLZ instruction.
18969 //
18970 // 1. i32/i64 128/256-bit vectors (native support requires VLX) are expanded
18971 //    to a 512-bit vector.
18972 // 2. i8/i16 vectors are implemented using the dword LZCNT vector instruction
18973 //    ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
18974 //    split the vector, perform the operation on its Lo and Hi parts and
18975 //    concatenate the results.
18976 static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
18977 assert(Op.getOpcode() == ISD::CTLZ);
18978 SDLoc dl(Op);
18979 MVT VT = Op.getSimpleValueType();
18980 MVT EltVT = VT.getVectorElementType();
18981 unsigned NumElems = VT.getVectorNumElements();
18982
18983 if (EltVT == MVT::i64 || EltVT == MVT::i32) {
18984 // Extend to 512 bit vector.
18985 assert((VT.is256BitVector() || VT.is128BitVector()) &&
18986 "Unsupported value type for operation");
18987
18988 MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
18989 SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
18990 DAG.getUNDEF(NewVT),
18991 Op.getOperand(0),
18992 DAG.getIntPtrConstant(0, dl));
18993 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
18994
18995 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
18996 DAG.getIntPtrConstant(0, dl));
18997 }
18998
18999 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
19000 "Unsupported element type");
19001
19002 if (16 < NumElems) {
19003     // Split the vector; its Lo and Hi parts will be handled in the next iteration.
19004 SDValue Lo, Hi;
19005 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
19006 MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
19007
19008 Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
19009 Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
19010
19011 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
19012 }
19013
19014 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
19015
19016 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
19017 "Unsupported value type for operation");
19018
19019   // Use the natively supported vector instruction vplzcntd.
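  // For example, for a v16i16 element x = 0x0001: zext32(x) = 0x00000001,
  // vplzcntd returns 31, truncation keeps 31, and subtracting 32 - 16 = 16
  // gives 15 = ctlz16(0x0001).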
19020 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
19021 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
19022 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
19023 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
19024
19025 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
19026 }
19027
19028 // Lower CTLZ using a PSHUFB lookup table implementation.
19029 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
19030 const X86Subtarget &Subtarget,
19031 SelectionDAG &DAG) {
19032 MVT VT = Op.getSimpleValueType();
19033 int NumElts = VT.getVectorNumElements();
19034 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
19035 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
19036
19037 // Per-nibble leading zero PSHUFB lookup table.
19038 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
19039 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
19040 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
19041 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
19042
19043 SmallVector<SDValue, 64> LUTVec;
19044 for (int i = 0; i < NumBytes; ++i)
19045 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
19046 SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec);
19047
19048   // Begin by bitcasting the input to a byte vector, then split those bytes
19049   // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
19050 // If the hi input nibble is zero then we add both results together, otherwise
19051 // we just take the hi result (by masking the lo result to zero before the
19052 // add).
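  // For example, byte 0x0C: hi nibble = 0x0, lo nibble = 0xC, HiZ is true, so
  // LUT[0x0] + LUT[0xC] = 4 + 0 = 4 = ctlz(0x0C). For byte 0x1C the hi nibble
  // is nonzero, so only LUT[0x1] = 3 is kept, giving ctlz(0x1C) = 3.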
19053 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
19054 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
19055
19056 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
19057 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
19058 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
19059 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
19060 SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
19061
19062 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
19063 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
19064 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
19065 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
19066
19067 // Merge result back from vXi8 back to VT, working on the lo/hi halves
19068 // of the current vector width in the same way we did for the nibbles.
19069 // If the upper half of the input element is zero then add the halves'
19070 // leading zero counts together, otherwise just use the upper half's.
19071 // Double the width of the result until we are at target width.
19072 while (CurrVT != VT) {
19073 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
19074 int CurrNumElts = CurrVT.getVectorNumElements();
19075 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
19076 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
19077 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
19078
19079 // Check if the upper half of the input element is zero.
19080 SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
19081 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
19082 HiZ = DAG.getBitcast(NextVT, HiZ);
19083
19084 // Move the upper/lower halves to the lower bits as we'll be extending to
19085 // NextVT. Mask the lower result to zero if HiZ is true and add the results
19086 // together.
19087 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
19088 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
19089 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
19090 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
19091 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
19092 CurrVT = NextVT;
19093 }
19094
19095 return Res;
19096 }
19097
19098 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
19099 const X86Subtarget &Subtarget,
19100 SelectionDAG &DAG) {
19101 MVT VT = Op.getSimpleValueType();
19102 SDValue Op0 = Op.getOperand(0);
19103
19104 if (Subtarget.hasAVX512())
19105 return LowerVectorCTLZ_AVX512(Op, DAG);
19106
19107 // Decompose 256-bit ops into smaller 128-bit ops.
19108 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
19109 unsigned NumElems = VT.getVectorNumElements();
19110
19111 // Extract each 128-bit vector, perform ctlz and concat the result.
19112 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
19113 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
19114
19115 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
19116 DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
19117 DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
19118 }
19119
19120 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
19121 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
19122 }
19123
19124 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
19125 SelectionDAG &DAG) {
19126 MVT VT = Op.getSimpleValueType();
19127 MVT OpVT = VT;
19128 unsigned NumBits = VT.getSizeInBits();
19129 SDLoc dl(Op);
19130 unsigned Opc = Op.getOpcode();
19131
19132 if (VT.isVector())
19133 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
19134
19135 Op = Op.getOperand(0);
19136 if (VT == MVT::i8) {
19137     // Zero-extend to i32 since there is no i8 bsr.
19138 OpVT = MVT::i32;
19139 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
19140 }
19141
19142 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
19143 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
19144 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
19145
19146 if (Opc == ISD::CTLZ) {
19147 // If src is zero (i.e. bsr sets ZF), returns NumBits.
19148 SDValue Ops[] = {
19149 Op,
19150 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
19151 DAG.getConstant(X86::COND_E, dl, MVT::i8),
19152 Op.getValue(1)
19153 };
19154 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
19155 }
19156
19157 // Finally xor with NumBits-1.
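  // Since BSR returns the index of the highest set bit and NumBits - 1 is
  // all ones, the XOR is equivalent to (NumBits - 1) - index; e.g. for i32,
  // x = 0x00010000 has BSR = 16 and 31 ^ 16 = 15 = ctlz(x).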
19158 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
19159 DAG.getConstant(NumBits - 1, dl, OpVT));
19160
19161 if (VT == MVT::i8)
19162 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
19163 return Op;
19164 }
19165
19166 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
19167 MVT VT = Op.getSimpleValueType();
19168 unsigned NumBits = VT.getScalarSizeInBits();
19169 SDLoc dl(Op);
19170
19171 if (VT.isVector()) {
19172 SDValue N0 = Op.getOperand(0);
19173 SDValue Zero = DAG.getConstant(0, dl, VT);
19174
19175 // lsb(x) = (x & -x)
19176 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
19177 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
19178
19179 // cttz_undef(x) = (width - 1) - ctlz(lsb)
19180 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
19181 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
19182 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
19183 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
19184 }
19185
19186 // cttz(x) = ctpop(lsb - 1)
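    // For example:
    //   x = 0b101000: lsb = 0b001000, lsb - 1 = 0b000111, ctpop = 3 = cttz(x).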
19187 SDValue One = DAG.getConstant(1, dl, VT);
19188 return DAG.getNode(ISD::CTPOP, dl, VT,
19189 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
19190 }
19191
19192 assert(Op.getOpcode() == ISD::CTTZ &&
19193 "Only scalar CTTZ requires custom lowering");
19194
19195 // Issue a bsf (scan bits forward) which also sets EFLAGS.
19196 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19197 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
19198
19199 // If src is zero (i.e. bsf sets ZF), returns NumBits.
19200 SDValue Ops[] = {
19201 Op,
19202 DAG.getConstant(NumBits, dl, VT),
19203 DAG.getConstant(X86::COND_E, dl, MVT::i8),
19204 Op.getValue(1)
19205 };
19206 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
19207 }
19208
19209 /// Break a 256-bit integer operation into two new 128-bit ones and then
19210 /// concatenate the result back.
19211 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
19212 MVT VT = Op.getSimpleValueType();
19213
19214 assert(VT.is256BitVector() && VT.isInteger() &&
19215 "Unsupported value type for operation");
19216
19217 unsigned NumElems = VT.getVectorNumElements();
19218 SDLoc dl(Op);
19219
19220 // Extract the LHS vectors
19221 SDValue LHS = Op.getOperand(0);
19222 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
19223 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
19224
19225 // Extract the RHS vectors
19226 SDValue RHS = Op.getOperand(1);
19227 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
19228 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
19229
19230 MVT EltVT = VT.getVectorElementType();
19231 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19232
19233 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
19234 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
19235 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
19236 }
19237
19238 /// Break a 512-bit integer operation into two new 256-bit ones and then
19239 /// concatenate the result back.
19240 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
19241 MVT VT = Op.getSimpleValueType();
19242
19243 assert(VT.is512BitVector() && VT.isInteger() &&
19244 "Unsupported value type for operation");
19245
19246 unsigned NumElems = VT.getVectorNumElements();
19247 SDLoc dl(Op);
19248
19249 // Extract the LHS vectors
19250 SDValue LHS = Op.getOperand(0);
19251 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
19252 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
19253
19254 // Extract the RHS vectors
19255 SDValue RHS = Op.getOperand(1);
19256 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
19257 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
19258
19259 MVT EltVT = VT.getVectorElementType();
19260 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19261
19262 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
19263 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
19264 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
19265 }
19266
19267 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
19268 if (Op.getValueType() == MVT::i1)
19269 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
19270 Op.getOperand(0), Op.getOperand(1));
19271 assert(Op.getSimpleValueType().is256BitVector() &&
19272 Op.getSimpleValueType().isInteger() &&
19273 "Only handle AVX 256-bit vector integer operation");
19274 return Lower256IntArith(Op, DAG);
19275 }
19276
19277 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
19278 if (Op.getValueType() == MVT::i1)
19279 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
19280 Op.getOperand(0), Op.getOperand(1));
19281 assert(Op.getSimpleValueType().is256BitVector() &&
19282 Op.getSimpleValueType().isInteger() &&
19283 "Only handle AVX 256-bit vector integer operation");
19284 return Lower256IntArith(Op, DAG);
19285 }
19286
19287 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
19288 assert(Op.getSimpleValueType().is256BitVector() &&
19289 Op.getSimpleValueType().isInteger() &&
19290 "Only handle AVX 256-bit vector integer operation");
19291 return Lower256IntArith(Op, DAG);
19292 }
19293
19294 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
19295 SelectionDAG &DAG) {
19296 SDLoc dl(Op);
19297 MVT VT = Op.getSimpleValueType();
19298
19299 if (VT == MVT::i1)
19300 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
19301
19302 // Decompose 256-bit ops into smaller 128-bit ops.
19303 if (VT.is256BitVector() && !Subtarget.hasInt256())
19304 return Lower256IntArith(Op, DAG);
19305
19306 SDValue A = Op.getOperand(0);
19307 SDValue B = Op.getOperand(1);
19308
19309 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
19310 // vector pairs, multiply and truncate.
19311 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
19312 if (Subtarget.hasInt256()) {
19313 // For 512-bit vectors, split into 256-bit vectors to allow the
19314 // sign-extension to occur.
19315 if (VT == MVT::v64i8)
19316 return Lower512IntArith(Op, DAG);
19317
19318 // For 256-bit vectors, split into 128-bit vectors to allow the
19319 // sign-extension to occur. We don't need this on AVX512BW as we can
19320 // safely sign-extend to v32i16.
19321 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
19322 return Lower256IntArith(Op, DAG);
19323
19324 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
19325 return DAG.getNode(
19326 ISD::TRUNCATE, dl, VT,
19327 DAG.getNode(ISD::MUL, dl, ExVT,
19328 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
19329 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
19330 }
19331
19332 assert(VT == MVT::v16i8 &&
19333 "Pre-AVX2 support only supports v16i8 multiplication");
19334 MVT ExVT = MVT::v8i16;
19335
19336 // Extract the lo parts and sign extend to i16
19337 SDValue ALo, BLo;
19338 if (Subtarget.hasSSE41()) {
19339 ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
19340 BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
19341 } else {
19342 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
19343 -1, 4, -1, 5, -1, 6, -1, 7};
19344 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19345 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19346 ALo = DAG.getBitcast(ExVT, ALo);
19347 BLo = DAG.getBitcast(ExVT, BLo);
19348 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
19349 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
19350 }
19351
19352 // Extract the hi parts and sign extend to i16
19353 SDValue AHi, BHi;
19354 if (Subtarget.hasSSE41()) {
19355 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
19356 -1, -1, -1, -1, -1, -1, -1, -1};
19357 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19358 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19359 AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
19360 BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
19361 } else {
19362 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
19363 -1, 12, -1, 13, -1, 14, -1, 15};
19364 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19365 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19366 AHi = DAG.getBitcast(ExVT, AHi);
19367 BHi = DAG.getBitcast(ExVT, BHi);
19368 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
19369 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
19370 }
19371
19372     // Multiply, mask the lower 8 bits of the lo/hi results and pack
19373 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
19374 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
19375 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
19376 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
19377 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
19378 }
19379
19380 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
19381 if (VT == MVT::v4i32) {
19382 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
19383 "Should not custom lower when pmuldq is available!");
19384
19385 // Extract the odd parts.
19386 static const int UnpackMask[] = { 1, -1, 3, -1 };
19387 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
19388 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
19389
19390 // Multiply the even parts.
19391 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
19392 // Now multiply odd parts.
19393 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
19394
19395 Evens = DAG.getBitcast(VT, Evens);
19396 Odds = DAG.getBitcast(VT, Odds);
19397
19398 // Merge the two vectors back together with a shuffle. This expands into 2
19399 // shuffles.
19400 static const int ShufMask[] = { 0, 4, 2, 6 };
19401 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
19402 }
19403
19404 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
19405 "Only know how to lower V2I64/V4I64/V8I64 multiply");
19406
19407 // Ahi = psrlqi(a, 32);
19408 // Bhi = psrlqi(b, 32);
19409 //
19410 // AloBlo = pmuludq(a, b);
19411 // AloBhi = pmuludq(a, Bhi);
19412 // AhiBlo = pmuludq(Ahi, b);
19413
19414 // AloBhi = psllqi(AloBhi, 32);
19415 // AhiBlo = psllqi(AhiBlo, 32);
19416 // return AloBlo + AloBhi + AhiBlo;
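  // This follows from writing each 64-bit lane as (Ahi * 2^32 + Alo) and
  // (Bhi * 2^32 + Blo): modulo 2^64 the product is
  //   Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32)
  // because the Ahi*Bhi term is shifted out entirely.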
19417
19418 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
19419 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
19420
19421 SDValue AhiBlo = Ahi;
19422 SDValue AloBhi = Bhi;
19423 // Bit cast to 32-bit vectors for MULUDQ
19424 MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
19425 (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
19426 A = DAG.getBitcast(MulVT, A);
19427 B = DAG.getBitcast(MulVT, B);
19428 Ahi = DAG.getBitcast(MulVT, Ahi);
19429 Bhi = DAG.getBitcast(MulVT, Bhi);
19430
19431 SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
19432   // After shifting the constant values right, the result may be all zeros.
19433 if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) {
19434 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
19435 AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
19436 }
19437 if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) {
19438 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
19439 AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
19440 }
19441
19442 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
19443 return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
19444 }
19445
19446 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
19447 SelectionDAG &DAG) {
19448 SDLoc dl(Op);
19449 MVT VT = Op.getSimpleValueType();
19450
19451 // Decompose 256-bit ops into smaller 128-bit ops.
19452 if (VT.is256BitVector() && !Subtarget.hasInt256())
19453 return Lower256IntArith(Op, DAG);
19454
19455 // Only i8 vectors should need custom lowering after this.
19456 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
19457 "Unsupported vector type");
19458
19459 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
19460 // logical shift down the upper half and pack back to i8.
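  // For example, for MULHU on i8 lanes a = 200 and b = 100: the zero-extended
  // product is 200 * 100 = 20000 = 0x4E20, and the logical shift right by 8
  // leaves 0x4E = 78, the high byte of the product.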
19461 SDValue A = Op.getOperand(0);
19462 SDValue B = Op.getOperand(1);
19463
19464 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
19465 // and then ashr/lshr the upper bits down to the lower bits before multiply.
19466 unsigned Opcode = Op.getOpcode();
19467 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
19468 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
19469
19470 // AVX2 implementations - extend xmm subvectors to ymm.
19471 if (Subtarget.hasInt256()) {
19472 SDValue Lo = DAG.getIntPtrConstant(0, dl);
19473 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
19474
19475 if (VT == MVT::v32i8) {
19476 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
19477 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
19478 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
19479 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
19480 ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
19481 BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
19482 AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
19483 BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
19484 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
19485 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
19486 DAG.getConstant(8, dl, MVT::v16i16));
19487 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
19488 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
19489 DAG.getConstant(8, dl, MVT::v16i16));
19490 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
19491 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
19492 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
19493 16, 17, 18, 19, 20, 21, 22, 23};
19494 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
19495 24, 25, 26, 27, 28, 29, 30, 31};
19496 return DAG.getNode(X86ISD::PACKUS, dl, VT,
19497 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
19498 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
19499 }
19500
19501 SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
19502 SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
19503 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
19504 SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
19505 DAG.getConstant(8, dl, MVT::v16i16));
19506 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
19507 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
19508 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
19509 }
19510
19511 assert(VT == MVT::v16i8 &&
19512 "Pre-AVX2 support only supports v16i8 multiplication");
19513 MVT ExVT = MVT::v8i16;
19514
19515 // Extract the lo parts and zero/sign extend to i16.
19516 SDValue ALo, BLo;
19517 if (Subtarget.hasSSE41()) {
19518 ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
19519 BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
19520 } else {
19521 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
19522 -1, 4, -1, 5, -1, 6, -1, 7};
19523 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19524 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19525 ALo = DAG.getBitcast(ExVT, ALo);
19526 BLo = DAG.getBitcast(ExVT, BLo);
19527 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
19528 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
19529 }
19530
19531 // Extract the hi parts and zero/sign extend to i16.
19532 SDValue AHi, BHi;
19533 if (Subtarget.hasSSE41()) {
19534 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
19535 -1, -1, -1, -1, -1, -1, -1, -1};
19536 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19537 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19538 AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
19539 BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
19540 } else {
19541 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
19542 -1, 12, -1, 13, -1, 14, -1, 15};
19543 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19544 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19545 AHi = DAG.getBitcast(ExVT, AHi);
19546 BHi = DAG.getBitcast(ExVT, BHi);
19547 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
19548 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
19549 }
19550
19551   // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results
19552   // and pack back to v16i8.
19553 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
19554 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
19555 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
19556 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
19557 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
19558 }
19559
19560 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
19561 assert(Subtarget.isTargetWin64() && "Unexpected target");
19562 EVT VT = Op.getValueType();
19563 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
19564 "Unexpected return type for lowering");
19565
19566 RTLIB::Libcall LC;
19567 bool isSigned;
19568 switch (Op->getOpcode()) {
19569 default: llvm_unreachable("Unexpected request for libcall!");
19570 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
19571 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
19572 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
19573 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
19574 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
19575 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
19576 }
19577
19578 SDLoc dl(Op);
19579 SDValue InChain = DAG.getEntryNode();
19580
19581 TargetLowering::ArgListTy Args;
19582 TargetLowering::ArgListEntry Entry;
19583 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
19584 EVT ArgVT = Op->getOperand(i).getValueType();
19585 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
19586 "Unexpected argument type for lowering");
19587 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
19588 Entry.Node = StackPtr;
19589 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
19590 false, false, 16);
19591 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19592 Entry.Ty = PointerType::get(ArgTy,0);
19593 Entry.isSExt = false;
19594 Entry.isZExt = false;
19595 Args.push_back(Entry);
19596 }
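  // Each 128-bit operand has been spilled to a 16-byte aligned stack slot
  // above; the libcall receives pointers to those slots rather than the
  // values themselves.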
19597
19598 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
19599 getPointerTy(DAG.getDataLayout()));
19600
19601 TargetLowering::CallLoweringInfo CLI(DAG);
19602 CLI.setDebugLoc(dl).setChain(InChain)
19603 .setCallee(getLibcallCallingConv(LC),
19604 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
19605 Callee, std::move(Args))
19606 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
19607
19608 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
19609 return DAG.getBitcast(VT, CallInfo.first);
19610 }
19611
19612 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
19613 SelectionDAG &DAG) {
19614 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
19615 MVT VT = Op0.getSimpleValueType();
19616 SDLoc dl(Op);
19617
19618 // Decompose 256-bit ops into smaller 128-bit ops.
19619 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
19620 unsigned Opcode = Op.getOpcode();
19621 unsigned NumElems = VT.getVectorNumElements();
19622 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
19623 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
19624 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
19625 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
19626 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
19627 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
19628 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
19629 SDValue Ops[] = {
19630 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
19631 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
19632 };
19633 return DAG.getMergeValues(Ops, dl);
19634 }
19635
19636 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
19637 (VT == MVT::v8i32 && Subtarget.hasInt256()));
19638
19639   // PMULxD operations multiply each even value (starting at 0) of LHS with
19640   // the related value of RHS and produce a widened result.
19641   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
19642   // => <2 x i64> <ae|cg>
19643   //
19644   // In other words, to have all the results, we need to perform two PMULxD:
19645   // 1. one with the even values.
19646   // 2. one with the odd values.
19647   // To achieve #2, we need to place the odd values at an even position.
19648 //
19649 // Place the odd value at an even position (basically, shift all values 1
19650 // step to the left):
19651 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
19652 // <a|b|c|d> => <b|undef|d|undef>
19653 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
19654 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
19655 // <e|f|g|h> => <f|undef|h|undef>
19656 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
19657 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
19658
19659 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
19660 // ints.
19661 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
19662 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
19663 unsigned Opcode =
19664 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
19665 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
19666 // => <2 x i64> <ae|cg>
19667 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
19668 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
19669 // => <2 x i64> <bf|dh>
19670 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
19671
19672 // Shuffle it back into the right order.
19673 SDValue Highs, Lows;
19674 if (VT == MVT::v8i32) {
19675 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
19676 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
19677 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
19678 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
19679 } else {
19680 const int HighMask[] = {1, 5, 3, 7};
19681 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
19682 const int LowMask[] = {0, 4, 2, 6};
19683 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
19684 }
19685
19686   // If we have a signed multiply but no PMULDQ, fix up the high parts of an
19687   // unsigned multiply.
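  // This uses the identity (per 32-bit lane):
  //   mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)
  // T1 and T2 below compute the two correction terms, using an arithmetic
  // shift by 31 to build an all-ones or all-zeros mask.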
19688 if (IsSigned && !Subtarget.hasSSE41()) {
19689 SDValue ShAmt = DAG.getConstant(
19690 31, dl,
19691 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
19692 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
19693 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
19694 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
19695 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
19696
19697 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
19698 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
19699 }
19700
19701 // The first result of MUL_LOHI is actually the low value, followed by the
19702 // high value.
19703 SDValue Ops[] = {Lows, Highs};
19704 return DAG.getMergeValues(Ops, dl);
19705 }
19706
19707 // Return true if the required (according to Opcode) shift-imm form is natively
19708 // supported by the Subtarget
19709 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
19710 unsigned Opcode) {
19711 if (VT.getScalarSizeInBits() < 16)
19712 return false;
19713
19714 if (VT.is512BitVector() &&
19715 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
19716 return true;
19717
19718 bool LShift = VT.is128BitVector() ||
19719 (VT.is256BitVector() && Subtarget.hasInt256());
19720
19721 bool AShift = LShift && (Subtarget.hasVLX() ||
19722 (VT != MVT::v2i64 && VT != MVT::v4i64));
19723 return (Opcode == ISD::SRA) ? AShift : LShift;
19724 }
19725
19726 // The shift amount is a variable, but it is the same for all vector lanes.
19727 // These instructions are defined together with shift-immediate.
19728 static
19729 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
19730 unsigned Opcode) {
19731 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
19732 }
19733
19734 // Return true if the required (according to Opcode) variable-shift form is
19735 // natively supported by the Subtarget
19736 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
19737 unsigned Opcode) {
19738
19739 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
19740 return false;
19741
19742   // vXi16 is supported only with AVX-512 BWI.
19743 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
19744 return false;
19745
19746 if (VT.is512BitVector() || Subtarget.hasVLX())
19747 return true;
19748
19749 bool LShift = VT.is128BitVector() || VT.is256BitVector();
19750 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
19751 return (Opcode == ISD::SRA) ? AShift : LShift;
19752 }
19753
19754 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
19755 const X86Subtarget &Subtarget) {
19756 MVT VT = Op.getSimpleValueType();
19757 SDLoc dl(Op);
19758 SDValue R = Op.getOperand(0);
19759 SDValue Amt = Op.getOperand(1);
19760
19761 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
19762 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
19763
19764 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
19765 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
19766 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
19767 SDValue Ex = DAG.getBitcast(ExVT, R);
19768
19769 if (ShiftAmt >= 32) {
19770 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
19771 SDValue Upper =
19772 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
19773 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
19774 ShiftAmt - 32, DAG);
19775 if (VT == MVT::v2i64)
19776 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
19777 if (VT == MVT::v4i64)
19778 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
19779 {9, 1, 11, 3, 13, 5, 15, 7});
19780 } else {
19781       // SRA upper i32, SRL whole i64 and select lower i32.
19782 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
19783 ShiftAmt, DAG);
19784 SDValue Lower =
19785 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
19786 Lower = DAG.getBitcast(ExVT, Lower);
19787 if (VT == MVT::v2i64)
19788 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
19789 if (VT == MVT::v4i64)
19790 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
19791 {8, 1, 10, 3, 12, 5, 14, 7});
19792 }
19793 return DAG.getBitcast(VT, Ex);
19794 };
19795
19796 // Optimize shl/srl/sra with constant shift amount.
19797 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
19798 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
19799 uint64_t ShiftAmt = ShiftConst->getZExtValue();
19800
19801 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
19802 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
19803
19804 // i64 SRA needs to be performed as partial shifts.
19805 if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
19806 Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
19807 return ArithmeticShiftRight64(ShiftAmt);
19808
19809 if (VT == MVT::v16i8 ||
19810 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
19811 VT == MVT::v64i8) {
19812 unsigned NumElts = VT.getVectorNumElements();
19813 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
19814
19815 // Simple i8 add case
19816 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
19817 return DAG.getNode(ISD::ADD, dl, VT, R, R);
19818
19819 // ashr(R, 7) === cmp_slt(R, 0)
19820 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
19821 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
19822 if (VT.is512BitVector()) {
19823 assert(VT == MVT::v64i8 && "Unexpected element type!");
19824 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
19825 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
19826 }
19827 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
19828 }
19829
19830 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
19831 if (VT == MVT::v16i8 && Subtarget.hasXOP())
19832 return SDValue();
19833
19834 if (Op.getOpcode() == ISD::SHL) {
19835 // Make a large shift.
19836 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
19837 R, ShiftAmt, DAG);
19838 SHL = DAG.getBitcast(VT, SHL);
19839 // Zero out the rightmost bits.
19840 return DAG.getNode(ISD::AND, dl, VT, SHL,
19841 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
19842 }
19843 if (Op.getOpcode() == ISD::SRL) {
19844 // Make a large shift.
19845 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
19846 R, ShiftAmt, DAG);
19847 SRL = DAG.getBitcast(VT, SRL);
19848 // Zero out the leftmost bits.
19849 return DAG.getNode(ISD::AND, dl, VT, SRL,
19850 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
19851 }
19852 if (Op.getOpcode() == ISD::SRA) {
19853 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
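        // For example, with ShiftAmt = 4 and an input lane of 0xF0 (-16):
        // lshr gives 0x0F, Mask = 0x08, 0x0F ^ 0x08 = 0x07, and
        // 0x07 - 0x08 = 0xFF (-1), which is -16 >>s 4.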
19854 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
19855
19856 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
19857 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
19858 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
19859 return Res;
19860 }
19861 llvm_unreachable("Unknown shift opcode.");
19862 }
19863 }
19864 }
19865
19866 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
19867 if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
19868 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64))) {
19869
19870 // Peek through any splat that was introduced for i64 shift vectorization.
19871 int SplatIndex = -1;
19872 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
19873 if (SVN->isSplat()) {
19874 SplatIndex = SVN->getSplatIndex();
19875 Amt = Amt.getOperand(0);
19876 assert(SplatIndex < (int)VT.getVectorNumElements() &&
19877 "Splat shuffle referencing second operand");
19878 }
19879
19880 if (Amt.getOpcode() != ISD::BITCAST ||
19881 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
19882 return SDValue();
19883
19884 Amt = Amt.getOperand(0);
19885 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
19886 VT.getVectorNumElements();
19887 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
19888 uint64_t ShiftAmt = 0;
19889 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
19890 for (unsigned i = 0; i != Ratio; ++i) {
19891 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
19892 if (!C)
19893 return SDValue();
19894 // 6 == Log2(64)
19895 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
19896 }
19897
19898 // Check remaining shift amounts (if not a splat).
19899 if (SplatIndex < 0) {
19900 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
19901 uint64_t ShAmt = 0;
19902 for (unsigned j = 0; j != Ratio; ++j) {
19903 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
19904 if (!C)
19905 return SDValue();
19906 // 6 == Log2(64)
19907 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
19908 }
19909 if (ShAmt != ShiftAmt)
19910 return SDValue();
19911 }
19912 }
19913
19914 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
19915 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
19916
19917 if (Op.getOpcode() == ISD::SRA)
19918 return ArithmeticShiftRight64(ShiftAmt);
19919 }
19920
19921 return SDValue();
19922 }
19923
19924 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
19925 const X86Subtarget &Subtarget) {
19926 MVT VT = Op.getSimpleValueType();
19927 SDLoc dl(Op);
19928 SDValue R = Op.getOperand(0);
19929 SDValue Amt = Op.getOperand(1);
19930
19931 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
19932 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
19933
19934 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
19935 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
19936
19937 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
19938 SDValue BaseShAmt;
19939 MVT EltVT = VT.getVectorElementType();
19940
19941 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
19942 // Check if this build_vector node is doing a splat.
19943 // If so, then set BaseShAmt equal to the splat value.
19944 BaseShAmt = BV->getSplatValue();
19945 if (BaseShAmt && BaseShAmt.isUndef())
19946 BaseShAmt = SDValue();
19947 } else {
19948 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
19949 Amt = Amt.getOperand(0);
19950
19951 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
19952 if (SVN && SVN->isSplat()) {
19953 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
19954 SDValue InVec = Amt.getOperand(0);
19955 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
19956 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
19957 "Unexpected shuffle index found!");
19958 BaseShAmt = InVec.getOperand(SplatIdx);
19959 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
19960 if (ConstantSDNode *C =
19961 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
19962 if (C->getZExtValue() == SplatIdx)
19963 BaseShAmt = InVec.getOperand(1);
19964 }
19965 }
19966
19967 if (!BaseShAmt)
19968 // Avoid introducing an extract element from a shuffle.
19969 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
19970 DAG.getIntPtrConstant(SplatIdx, dl));
19971 }
19972 }
19973
19974 if (BaseShAmt.getNode()) {
19975 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
19976 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
19977 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
19978 else if (EltVT.bitsLT(MVT::i32))
19979 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
19980
19981 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG);
19982 }
19983 }
19984
19985 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
19986 if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
19987 Amt.getOpcode() == ISD::BITCAST &&
19988 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
19989 Amt = Amt.getOperand(0);
19990 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
19991 VT.getVectorNumElements();
19992 std::vector<SDValue> Vals(Ratio);
19993 for (unsigned i = 0; i != Ratio; ++i)
19994 Vals[i] = Amt.getOperand(i);
19995 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
19996 for (unsigned j = 0; j != Ratio; ++j)
19997 if (Vals[j] != Amt.getOperand(i + j))
19998 return SDValue();
19999 }
20000
20001 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
20002 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
20003 }
20004 return SDValue();
20005 }
20006
20007 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
20008 SelectionDAG &DAG) {
20009 MVT VT = Op.getSimpleValueType();
20010 SDLoc dl(Op);
20011 SDValue R = Op.getOperand(0);
20012 SDValue Amt = Op.getOperand(1);
20013 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
20014
20015 assert(VT.isVector() && "Custom lowering only for vector shifts!");
20016 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
20017
20018 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
20019 return V;
20020
20021 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
20022 return V;
20023
20024 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
20025 return Op;
20026
20027 // XOP has 128-bit variable logical/arithmetic shifts.
20028 // +ve/-ve Amt = shift left/right.
20029 if (Subtarget.hasXOP() &&
20030 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
20031 VT == MVT::v8i16 || VT == MVT::v16i8)) {
20032 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
20033 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
20034 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
20035 }
20036 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
20037 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
20038 if (Op.getOpcode() == ISD::SRA)
20039 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
20040 }
20041
20042   // v2i64 vector logical shifts can efficiently avoid scalarization - do the
20043 // shifts per-lane and then shuffle the partial results back together.
20044 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
20045 // Splat the shift amounts so the scalar shifts above will catch it.
20046 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
20047 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
20048 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
20049 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
20050 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
20051 }
20052
20053 // i64 vector arithmetic shift can be emulated with the transform:
20054 // M = lshr(SIGN_BIT, Amt)
20055 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
20056 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
20057 Op.getOpcode() == ISD::SRA) {
20058 SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
20059 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
20060 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
20061 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
20062 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
20063 return R;
20064 }
20065
20066 // If possible, lower this packed shift into a vector multiply instead of
20067 // expanding it into a sequence of scalar shifts.
20068 // Do this only if the vector shift count is a constant build_vector.
20069 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
20070 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
20071 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
20072 SmallVector<SDValue, 8> Elts;
20073 MVT SVT = VT.getVectorElementType();
20074 unsigned SVTBits = SVT.getSizeInBits();
20075 APInt One(SVTBits, 1);
20076 unsigned NumElems = VT.getVectorNumElements();
20077
20078 for (unsigned i=0; i !=NumElems; ++i) {
20079 SDValue Op = Amt->getOperand(i);
20080 if (Op->isUndef()) {
20081 Elts.push_back(Op);
20082 continue;
20083 }
20084
20085 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
20086 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
20087 uint64_t ShAmt = C.getZExtValue();
20088 if (ShAmt >= SVTBits) {
20089 Elts.push_back(DAG.getUNDEF(SVT));
20090 continue;
20091 }
20092 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
20093 }
20094 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
20095 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
20096 }
20097
20098 // Lower SHL with variable shift amount.
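  // Each lane is turned into the f32 value 2^Amt: (Amt << 23) places Amt in
  // the exponent field and adding 0x3f800000 (1.0f) rebiases it, so the
  // FP_TO_SINT result is 2^Amt and the final MUL computes R << Amt.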
20099 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
20100 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
20101
20102 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
20103 DAG.getConstant(0x3f800000U, dl, VT));
20104 Op = DAG.getBitcast(MVT::v4f32, Op);
20105 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
20106 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
20107 }
20108
20109 // If possible, lower this shift as a sequence of two shifts by
20110 // constant plus a MOVSS/MOVSD instead of scalarizing it.
20111 // Example:
20112 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
20113 //
20114 // Could be rewritten as:
20115 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
20116 //
20117 // The advantage is that the two shifts from the example would be
20118 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
20119 // the vector shift into four scalar shifts plus four pairs of vector
20120 // insert/extract.
20121 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
20122 unsigned TargetOpcode = X86ISD::MOVSS;
20123 bool CanBeSimplified;
20124 // The splat value for the first packed shift (the 'X' from the example).
20125 SDValue Amt1 = Amt->getOperand(0);
20126 // The splat value for the second packed shift (the 'Y' from the example).
20127 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
20128
20129 // See if it is possible to replace this node with a sequence of
20130 // two shifts followed by a MOVSS/MOVSD
20131 if (VT == MVT::v4i32) {
20132 // Check if it is legal to use a MOVSS.
20133 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
20134 Amt2 == Amt->getOperand(3);
20135 if (!CanBeSimplified) {
20136 // Otherwise, check if we can still simplify this node using a MOVSD.
20137 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
20138 Amt->getOperand(2) == Amt->getOperand(3);
20139 TargetOpcode = X86ISD::MOVSD;
20140 Amt2 = Amt->getOperand(2);
20141 }
20142 } else {
20143 // Do similar checks for the case where the machine value type
20144 // is MVT::v8i16.
20145 CanBeSimplified = Amt1 == Amt->getOperand(1);
20146 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
20147 CanBeSimplified = Amt2 == Amt->getOperand(i);
20148
20149 if (!CanBeSimplified) {
20150 TargetOpcode = X86ISD::MOVSD;
20151 CanBeSimplified = true;
20152 Amt2 = Amt->getOperand(4);
20153 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
20154 CanBeSimplified = Amt1 == Amt->getOperand(i);
20155 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
20156 CanBeSimplified = Amt2 == Amt->getOperand(j);
20157 }
20158 }
20159
20160 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
20161 isa<ConstantSDNode>(Amt2)) {
20162 // Replace this node with two shifts followed by a MOVSS/MOVSD.
20163 MVT CastVT = MVT::v4i32;
20164 SDValue Splat1 =
20165 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
20166 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
20167 SDValue Splat2 =
20168 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
20169 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
20170 if (TargetOpcode == X86ISD::MOVSD)
20171 CastVT = MVT::v2i64;
20172 SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
20173 SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
20174 SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
20175 BitCast1, DAG);
20176 return DAG.getBitcast(VT, Result);
20177 }
20178 }
20179
20180 // v4i32 non-uniform shifts.
20181 // If the shift amount is constant we can shift each lane using the SSE2
20182 // immediate shifts, else we need to zero-extend each lane to the lower i64
20183 // and shift using the SSE2 variable shifts.
20184 // The separate results can then be blended together.
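// Illustrative example: for R = <r0,r1,r2,r3> and Amt = <a0,a1,a2,a3>, we
// build Ri = shift(R, splat(ai)) for each lane and then blend lane i from Ri,
// yielding <shift(r0,a0), shift(r1,a1), shift(r2,a2), shift(r3,a3)>.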
20185 if (VT == MVT::v4i32) {
20186 unsigned Opc = Op.getOpcode();
20187 SDValue Amt0, Amt1, Amt2, Amt3;
20188 if (ConstantAmt) {
20189 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
20190 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
20191 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
20192 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
20193 } else {
20194 // ISD::SHL is handled above but we include it here for completeness.
20195 switch (Opc) {
20196 default:
20197 llvm_unreachable("Unknown target vector shift node");
20198 case ISD::SHL:
20199 Opc = X86ISD::VSHL;
20200 break;
20201 case ISD::SRL:
20202 Opc = X86ISD::VSRL;
20203 break;
20204 case ISD::SRA:
20205 Opc = X86ISD::VSRA;
20206 break;
20207 }
20208 // The SSE2 shifts use the lower i64 as the same shift amount for
20209 // all lanes and the upper i64 is ignored. These shuffle masks
20210 // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
20211 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
20212 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
20213 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
20214 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
20215 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
20216 }
20217
20218 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
20219 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
20220 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
20221 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
20222 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
20223 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
20224 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
20225 }
20226
20227 if (VT == MVT::v16i8 ||
20228 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
20229 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
20230 unsigned ShiftOpcode = Op->getOpcode();
20231
20232 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
20233 // On SSE41 targets we make use of the fact that VSELECT lowers
20234 // to PBLENDVB which selects bytes based just on the sign bit.
20235 if (Subtarget.hasSSE41()) {
20236 V0 = DAG.getBitcast(VT, V0);
20237 V1 = DAG.getBitcast(VT, V1);
20238 Sel = DAG.getBitcast(VT, Sel);
20239 return DAG.getBitcast(SelVT,
20240 DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
20241 }
20242 // On pre-SSE41 targets we test for the sign bit by comparing to
20243 // zero - a negative value will set all bits of the lanes to true
20244 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
20245 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
20246 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
20247 return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
20248 };
20249
20250 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
20251 // We can safely do this using i16 shifts as we're only interested in
20252 // the 3 lower bits of each byte.
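// Illustrative: for a byte shift amount in [0,7], (amt << 5) places amt's
// bit 2 in the byte's sign bit, selecting the shift-by-4 step below; each
// subsequent 'a += a' then moves bit 1 and bit 0 into the sign bit for the
// shift-by-2 and shift-by-1 steps.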
20253 Amt = DAG.getBitcast(ExtVT, Amt);
20254 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
20255 Amt = DAG.getBitcast(VT, Amt);
20256
20257 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
20258 // r = VSELECT(r, shift(r, 4), a);
20259 SDValue M =
20260 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
20261 R = SignBitSelect(VT, Amt, M, R);
20262
20263 // a += a
20264 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20265
20266 // r = VSELECT(r, shift(r, 2), a);
20267 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
20268 R = SignBitSelect(VT, Amt, M, R);
20269
20270 // a += a
20271 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20272
20273 // return VSELECT(r, shift(r, 1), a);
20274 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
20275 R = SignBitSelect(VT, Amt, M, R);
20276 return R;
20277 }
20278
20279 if (Op->getOpcode() == ISD::SRA) {
20280 // For SRA we need to unpack each byte to the higher byte of an i16 vector
20281 // so we can correctly sign extend. We don't care what happens to the
20282 // lower byte.
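// Illustrative: unpacking byte b into the high byte of an i16 lane gives
// (b << 8), so an arithmetic i16 shift by k sign-fills from b's sign bit;
// the logical shift right by 8 and PACKUS below then recover the i8 lanes.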
20283 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
20284 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
20285 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
20286 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
20287 ALo = DAG.getBitcast(ExtVT, ALo);
20288 AHi = DAG.getBitcast(ExtVT, AHi);
20289 RLo = DAG.getBitcast(ExtVT, RLo);
20290 RHi = DAG.getBitcast(ExtVT, RHi);
20291
20292 // r = VSELECT(r, shift(r, 4), a);
20293 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
20294 DAG.getConstant(4, dl, ExtVT));
20295 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
20296 DAG.getConstant(4, dl, ExtVT));
20297 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
20298 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
20299
20300 // a += a
20301 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
20302 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
20303
20304 // r = VSELECT(r, shift(r, 2), a);
20305 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
20306 DAG.getConstant(2, dl, ExtVT));
20307 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
20308 DAG.getConstant(2, dl, ExtVT));
20309 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
20310 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
20311
20312 // a += a
20313 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
20314 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
20315
20316 // r = VSELECT(r, shift(r, 1), a);
20317 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
20318 DAG.getConstant(1, dl, ExtVT));
20319 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
20320 DAG.getConstant(1, dl, ExtVT));
20321 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
20322 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
20323
20324 // Logical shift the result back to the lower byte, leaving a zero upper
20325 // byte, meaning that we can safely pack with PACKUSWB.
20327 RLo =
20328 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
20329 RHi =
20330 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
20331 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
20332 }
20333 }
20334
20335 // It's worth extending once and using the v8i32 shifts for 16-bit types, but
20336 // the extra overheads to get from v16i8 to v8i32 make the existing SSE
20337 // solution better.
20338 if (Subtarget.hasInt256() && VT == MVT::v8i16) {
20339 MVT ExtVT = MVT::v8i32;
20340 unsigned ExtOpc =
20341 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20342 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
20343 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
20344 return DAG.getNode(ISD::TRUNCATE, dl, VT,
20345 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
20346 }
20347
20348 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
20349 MVT ExtVT = MVT::v8i32;
20350 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
20351 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
20352 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
20353 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
20354 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
20355 ALo = DAG.getBitcast(ExtVT, ALo);
20356 AHi = DAG.getBitcast(ExtVT, AHi);
20357 RLo = DAG.getBitcast(ExtVT, RLo);
20358 RHi = DAG.getBitcast(ExtVT, RHi);
20359 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
20360 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
20361 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
20362 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
20363 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
20364 }
20365
20366 if (VT == MVT::v8i16) {
20367 unsigned ShiftOpcode = Op->getOpcode();
20368
20369 // If we have a constant shift amount, the non-SSE41 path is best as
20370 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
20371 bool UseSSE41 = Subtarget.hasSSE41() &&
20372 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
20373
20374 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
20375 // On SSE41 targets we make use of the fact that VSELECT lowers
20376 // to PBLENDVB which selects bytes based just on the sign bit.
20377 if (UseSSE41) {
20378 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
20379 V0 = DAG.getBitcast(ExtVT, V0);
20380 V1 = DAG.getBitcast(ExtVT, V1);
20381 Sel = DAG.getBitcast(ExtVT, Sel);
20382 return DAG.getBitcast(
20383 VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
20384 }
20385 // On pre-SSE41 targets we splat the sign bit - a negative value will
20386 // set all bits of the lanes to true and VSELECT uses that in
20387 // its OR(AND(V0,C),AND(V1,~C)) lowering.
20388 SDValue C =
20389 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
20390 return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
20391 };
20392
20393 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
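// Illustrative: for a 16-bit shift amount in [0,15], (amt << 12) places
// amt's bit 3 in the sign bit to drive the shift-by-8 step; each 'a += a'
// below then exposes bits 2, 1 and 0 for the 4/2/1 steps.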
20394 if (UseSSE41) {
20395 // On SSE41 targets we need to replicate the shift mask in both
20396 // bytes for PBLENDVB.
20397 Amt = DAG.getNode(
20398 ISD::OR, dl, VT,
20399 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
20400 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
20401 } else {
20402 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
20403 }
20404
20405 // r = VSELECT(r, shift(r, 8), a);
20406 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
20407 R = SignBitSelect(Amt, M, R);
20408
20409 // a += a
20410 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20411
20412 // r = VSELECT(r, shift(r, 4), a);
20413 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
20414 R = SignBitSelect(Amt, M, R);
20415
20416 // a += a
20417 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20418
20419 // r = VSELECT(r, shift(r, 2), a);
20420 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
20421 R = SignBitSelect(Amt, M, R);
20422
20423 // a += a
20424 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20425
20426 // return VSELECT(r, shift(r, 1), a);
20427 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
20428 R = SignBitSelect(Amt, M, R);
20429 return R;
20430 }
20431
20432 // Decompose 256-bit shifts into smaller 128-bit shifts.
20433 if (VT.is256BitVector())
20434 return Lower256IntArith(Op, DAG);
20435
20436 return SDValue();
20437 }
20438
20439 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
20440 SelectionDAG &DAG) {
20441 MVT VT = Op.getSimpleValueType();
20442 SDLoc DL(Op);
20443 SDValue R = Op.getOperand(0);
20444 SDValue Amt = Op.getOperand(1);
20445
20446 assert(VT.isVector() && "Custom lowering only for vector rotates!");
20447 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
20448 assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
20449
20450 // XOP has 128-bit vector variable + immediate rotates.
20451 // +ve/-ve Amt = rotate left/right.
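// Illustrative: a splat-constant amount, e.g. (rotl v4i32 R, <3,3,3,3>),
// becomes (X86ISD::VPROTI R, 3); any other amount uses the per-element
// X86ISD::VPROT form below.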
20452
20453 // Split 256-bit integers.
20454 if (VT.is256BitVector())
20455 return Lower256IntArith(Op, DAG);
20456
20457 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
20458
20459 // Attempt to rotate by immediate.
20460 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
20461 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
20462 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
20463 assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
20464 return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
20465 DAG.getConstant(RotateAmt, DL, MVT::i8));
20466 }
20467 }
20468
20469 // Use general rotate by variable (per-element).
20470 return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
20471 }
20472
20473 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
20474 // Lower the "add/sub/mul with overflow" instruction into a regular arithmetic op plus
20475 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
20476 // looks for this combo and may remove the "setcc" instruction if the "setcc"
20477 // has only one use.
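// Illustrative: (i32, i1) = uaddo a, b is lowered to an X86ISD::ADD that
// also produces EFLAGS, followed by a SETCC on X86::COND_B (carry); the
// signed variants test X86::COND_O (overflow) instead.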
20478 SDNode *N = Op.getNode();
20479 SDValue LHS = N->getOperand(0);
20480 SDValue RHS = N->getOperand(1);
20481 unsigned BaseOp = 0;
20482 unsigned Cond = 0;
20483 SDLoc DL(Op);
20484 switch (Op.getOpcode()) {
20485 default: llvm_unreachable("Unknown ovf instruction!");
20486 case ISD::SADDO:
20487 // An add of one will be selected as an INC. Note that INC doesn't
20488 // set CF, so we can't do this for UADDO.
20489 if (isOneConstant(RHS)) {
20490 BaseOp = X86ISD::INC;
20491 Cond = X86::COND_O;
20492 break;
20493 }
20494 BaseOp = X86ISD::ADD;
20495 Cond = X86::COND_O;
20496 break;
20497 case ISD::UADDO:
20498 BaseOp = X86ISD::ADD;
20499 Cond = X86::COND_B;
20500 break;
20501 case ISD::SSUBO:
20502 // A subtract of one will be selected as a DEC. Note that DEC doesn't
20503 // set CF, so we can't do this for USUBO.
20504 if (isOneConstant(RHS)) {
20505 BaseOp = X86ISD::DEC;
20506 Cond = X86::COND_O;
20507 break;
20508 }
20509 BaseOp = X86ISD::SUB;
20510 Cond = X86::COND_O;
20511 break;
20512 case ISD::USUBO:
20513 BaseOp = X86ISD::SUB;
20514 Cond = X86::COND_B;
20515 break;
20516 case ISD::SMULO:
20517 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
20518 Cond = X86::COND_O;
20519 break;
20520 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
20521 if (N->getValueType(0) == MVT::i8) {
20522 BaseOp = X86ISD::UMUL8;
20523 Cond = X86::COND_O;
20524 break;
20525 }
20526 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
20527 MVT::i32);
20528 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
20529
20530 SDValue SetCC =
20531 DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
20532 DAG.getConstant(X86::COND_O, DL, MVT::i32),
20533 SDValue(Sum.getNode(), 2));
20534
20535 if (N->getValueType(1) == MVT::i1) {
20536 SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
20537 DAG.getValueType(MVT::i1));
20538 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
20539 }
20540 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
20541 }
20542 }
20543
20544 // Also sets EFLAGS.
20545 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
20546 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
20547
20548 SDValue SetCC =
20549 DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
20550 DAG.getConstant(Cond, DL, MVT::i32),
20551 SDValue(Sum.getNode(), 1));
20552
20553 if (N->getValueType(1) == MVT::i1) {
20554 SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
20555 DAG.getValueType(MVT::i1));
20556 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
20557 }
20558 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
20559 }
20560
20561 /// Returns true if the operand type is exactly twice the native width, and
20562 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
20563 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
20564 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
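/// For example (illustrative), a 64-bit atomic on a 32-bit target needs
/// cmpxchg8b, while a 128-bit atomic on a 64-bit target needs cmpxchg16b.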
20565 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
20566 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
20567
20568 if (OpWidth == 64)
20569 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
20570 else if (OpWidth == 128)
20571 return Subtarget.hasCmpxchg16b();
20572 else
20573 return false;
20574 }
20575
20576 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
20577 return needsCmpXchgNb(SI->getValueOperand()->getType());
20578 }
20579
20580 // Note: this turns large loads into lock cmpxchg8b/16b.
20581 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
20582 TargetLowering::AtomicExpansionKind
20583 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
20584 auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
20585 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
20586 : AtomicExpansionKind::None;
20587 }
20588
20589 TargetLowering::AtomicExpansionKind
20590 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
20591 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
20592 Type *MemType = AI->getType();
20593
20594 // If the operand is too big, we must see if cmpxchg8/16b is available
20595 // and default to library calls otherwise.
20596 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
20597 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
20598 : AtomicExpansionKind::None;
20599 }
20600
20601 AtomicRMWInst::BinOp Op = AI->getOperation();
20602 switch (Op) {
20603 default:
20604 llvm_unreachable("Unknown atomic operation");
20605 case AtomicRMWInst::Xchg:
20606 case AtomicRMWInst::Add:
20607 case AtomicRMWInst::Sub:
20608 // It's better to use xadd, xsub or xchg for these in all cases.
20609 return AtomicExpansionKind::None;
20610 case AtomicRMWInst::Or:
20611 case AtomicRMWInst::And:
20612 case AtomicRMWInst::Xor:
20613 // If the atomicrmw's result isn't actually used, we can just add a "lock"
20614 // prefix to a normal instruction for these operations.
20615 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
20616 : AtomicExpansionKind::None;
20617 case AtomicRMWInst::Nand:
20618 case AtomicRMWInst::Max:
20619 case AtomicRMWInst::Min:
20620 case AtomicRMWInst::UMax:
20621 case AtomicRMWInst::UMin:
20622 // These always require a non-trivial set of data operations on x86. We must
20623 // use a cmpxchg loop.
20624 return AtomicExpansionKind::CmpXChg;
20625 }
20626 }
20627
20628 LoadInst *
20629 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
20630 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
20631 Type *MemType = AI->getType();
20632 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
20633 // there is no benefit in turning such RMWs into loads, and it is actually
20634 // harmful as it introduces an mfence.
20635 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
20636 return nullptr;
20637
20638 auto Builder = IRBuilder<>(AI);
20639 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20640 auto SynchScope = AI->getSynchScope();
20641 // We must restrict the ordering to avoid generating loads with Release or
20642 // ReleaseAcquire orderings.
20643 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
20644 auto Ptr = AI->getPointerOperand();
20645
20646 // Before the load we need a fence. Here is an example lifted from
20647 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
20648 // is required:
20649 // Thread 0:
20650 // x.store(1, relaxed);
20651 // r1 = y.fetch_add(0, release);
20652 // Thread 1:
20653 // y.fetch_add(42, acquire);
20654 // r2 = x.load(relaxed);
20655 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
20656 // lowered to just a load without a fence. An mfence flushes the store buffer,
20657 // making the optimization clearly correct.
20658 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
20659 // otherwise, we might be able to be more aggressive on relaxed idempotent
20660 // rmw. In practice, they do not look useful, so we don't try to be
20661 // especially clever.
20662 if (SynchScope == SingleThread)
20663 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
20664 // the IR level, so we must wrap it in an intrinsic.
20665 return nullptr;
20666
20667 if (!Subtarget.hasMFence())
20668 // FIXME: it might make sense to use a locked operation here but on a
20669 // different cache-line to prevent cache-line bouncing. In practice it
20670 // is probably a small win, and x86 processors without mfence are rare
20671 // enough that we do not bother.
20672 return nullptr;
20673
20674 Function *MFence =
20675 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
20676 Builder.CreateCall(MFence, {});
20677
20678 // Finally we can emit the atomic load.
20679 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
20680 AI->getType()->getPrimitiveSizeInBits());
20681 Loaded->setAtomic(Order, SynchScope);
20682 AI->replaceAllUsesWith(Loaded);
20683 AI->eraseFromParent();
20684 return Loaded;
20685 }
20686
20687 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
20688 SelectionDAG &DAG) {
20689 SDLoc dl(Op);
20690 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
20691 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
20692 SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
20693 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
20694
20695 // The only fence that needs an instruction is a sequentially-consistent
20696 // cross-thread fence.
20697 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
20698 FenceScope == CrossThread) {
20699 if (Subtarget.hasMFence())
20700 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
20701
20702 SDValue Chain = Op.getOperand(0);
20703 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
20704 SDValue Ops[] = {
20705 DAG.getRegister(X86::ESP, MVT::i32), // Base
20706 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
20707 DAG.getRegister(0, MVT::i32), // Index
20708 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
20709 DAG.getRegister(0, MVT::i32), // Segment.
20710 Zero,
20711 Chain
20712 };
20713 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
20714 return SDValue(Res, 0);
20715 }
20716
20717 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
20718 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
20719 }
20720
20721 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
20722 SelectionDAG &DAG) {
20723 MVT T = Op.getSimpleValueType();
20724 SDLoc DL(Op);
20725 unsigned Reg = 0;
20726 unsigned size = 0;
20727 switch(T.SimpleTy) {
20728 default: llvm_unreachable("Invalid value type!");
20729 case MVT::i8: Reg = X86::AL; size = 1; break;
20730 case MVT::i16: Reg = X86::AX; size = 2; break;
20731 case MVT::i32: Reg = X86::EAX; size = 4; break;
20732 case MVT::i64:
20733 assert(Subtarget.is64Bit() && "Node not type legal!");
20734 Reg = X86::RAX; size = 8;
20735 break;
20736 }
20737 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
20738 Op.getOperand(2), SDValue());
20739 SDValue Ops[] = { cpIn.getValue(0),
20740 Op.getOperand(1),
20741 Op.getOperand(3),
20742 DAG.getTargetConstant(size, DL, MVT::i8),
20743 cpIn.getValue(1) };
20744 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20745 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
20746 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
20747 Ops, T, MMO);
20748
20749 SDValue cpOut =
20750 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
20751 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
20752 MVT::i32, cpOut.getValue(2));
20753 SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
20754 DAG.getConstant(X86::COND_E, DL, MVT::i8),
20755 EFLAGS);
20756
20757 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
20758 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
20759 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
20760 return SDValue();
20761 }
20762
20763 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
20764 SelectionDAG &DAG) {
20765 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
20766 MVT DstVT = Op.getSimpleValueType();
20767
20768 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
20769 SrcVT == MVT::i64) {
20770 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
20771 if (DstVT != MVT::f64)
20772 // This conversion needs to be expanded.
20773 return SDValue();
20774
20775 SDValue Op0 = Op->getOperand(0);
20776 SmallVector<SDValue, 16> Elts;
20777 SDLoc dl(Op);
20778 unsigned NumElts;
20779 MVT SVT;
20780 if (SrcVT.isVector()) {
20781 NumElts = SrcVT.getVectorNumElements();
20782 SVT = SrcVT.getVectorElementType();
20783
20784 // Widen the input vector in the case of MVT::v2i32.
20785 // Example: from MVT::v2i32 to MVT::v4i32.
20786 for (unsigned i = 0, e = NumElts; i != e; ++i)
20787 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
20788 DAG.getIntPtrConstant(i, dl)));
20789 } else {
20790 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
20791 "Unexpected source type in LowerBITCAST");
20792 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
20793 DAG.getIntPtrConstant(0, dl)));
20794 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
20795 DAG.getIntPtrConstant(1, dl)));
20796 NumElts = 2;
20797 SVT = MVT::i32;
20798 }
20799 // Explicitly mark the extra elements as Undef.
20800 Elts.append(NumElts, DAG.getUNDEF(SVT));
20801
20802 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
20803 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
20804 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
20805 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
20806 DAG.getIntPtrConstant(0, dl));
20807 }
20808
20809 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
20810 Subtarget.hasMMX() && "Unexpected custom BITCAST");
20811 assert((DstVT == MVT::i64 ||
20812 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
20813 "Unexpected custom BITCAST");
20814 // i64 <=> MMX conversions are Legal.
20815 if (SrcVT==MVT::i64 && DstVT.isVector())
20816 return Op;
20817 if (DstVT==MVT::i64 && SrcVT.isVector())
20818 return Op;
20819 // MMX <=> MMX conversions are Legal.
20820 if (SrcVT.isVector() && DstVT.isVector())
20821 return Op;
20822 // All other conversions need to be expanded.
20823 return SDValue();
20824 }
20825
20826 /// Compute the horizontal sum of bytes in V for the elements of VT.
20827 ///
20828 /// Requires V to be a byte vector and VT to be an integer vector type with
20829 /// wider elements than V's type. The width of the elements of VT determines
20830 /// how many bytes of V are summed horizontally to produce each element of the
20831 /// result.
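/// For example (illustrative), summing a v16i8 of per-byte population counts
/// into a v4i32 adds each group of four consecutive bytes into one 32-bit
/// element.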
20832 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
20833 const X86Subtarget &Subtarget,
20834 SelectionDAG &DAG) {
20835 SDLoc DL(V);
20836 MVT ByteVecVT = V.getSimpleValueType();
20837 MVT EltVT = VT.getVectorElementType();
20838 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
20839 "Expected value to have byte element type.");
20840 assert(EltVT != MVT::i8 &&
20841 "Horizontal byte sum only makes sense for wider elements!");
20842 unsigned VecSize = VT.getSizeInBits();
20843 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
20844
20845 // The PSADBW instruction horizontally adds all bytes and leaves the result in
20846 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
20847 if (EltVT == MVT::i64) {
20848 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
20849 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
20850 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
20851 return DAG.getBitcast(VT, V);
20852 }
20853
20854 if (EltVT == MVT::i32) {
20855 // We unpack the low half and high half into i32s interleaved with zeros so
20856 // that we can use PSADBW to horizontally sum them. The most useful part of
20857 // this is that it lines up the results of two PSADBW instructions to be
20858 // two v2i64 vectors which concatenated are the 4 population counts. We can
20859 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
20860 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
20861 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros);
20862 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros);
20863
20864 // Do the horizontal sums into two v2i64s.
20865 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
20866 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
20867 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
20868 DAG.getBitcast(ByteVecVT, Low), Zeros);
20869 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
20870 DAG.getBitcast(ByteVecVT, High), Zeros);
20871
20872 // Merge them together.
20873 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
20874 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
20875 DAG.getBitcast(ShortVecVT, Low),
20876 DAG.getBitcast(ShortVecVT, High));
20877
20878 return DAG.getBitcast(VT, V);
20879 }
20880
20881 // The only element type left is i16.
20882 assert(EltVT == MVT::i16 && "Unknown how to handle type");
20883
20884 // To obtain pop count for each i16 element starting from the pop count for
20885 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
20886 // right by 8. It is important to shift as i16s because i8 vector shifts
20887 // aren't directly supported.
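// Illustrative: for an i16 lane holding byte counts [hi|lo], (v << 8) gives
// [lo|0]; the byte-wise add gives [hi+lo|lo]; the final i16 shift right by 8
// leaves [0|hi+lo], the 16-bit population count.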
20888 SDValue ShifterV = DAG.getConstant(8, DL, VT);
20889 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
20890 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
20891 DAG.getBitcast(ByteVecVT, V));
20892 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
20893 }
20894
20895 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
20896 const X86Subtarget &Subtarget,
20897 SelectionDAG &DAG) {
20898 MVT VT = Op.getSimpleValueType();
20899 MVT EltVT = VT.getVectorElementType();
20900 unsigned VecSize = VT.getSizeInBits();
20901
20902 // Implement a lookup table in register by using an algorithm based on:
20903 // http://wm.ite.pl/articles/sse-popcount.html
20904 //
20905 // The general idea is that every lower byte nibble in the input vector is an
20906 // index into an in-register pre-computed pop count table. We then split up the
20907 // input vector into two new ones: (1) a vector with only the shifted-right
20908 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
20909 // masked out higher ones) for each byte. PSHUFB is used separately with both
20910 // to index the in-register table. Next, both are added and the result is an
20911 // i8 vector where each element contains the pop count for its input byte.
20912 //
20913 // To obtain the pop count for elements != i8, we follow up with the same
20914 // approach and use additional tricks as described below.
20915 //
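// Illustrative: for the byte 0xE3, the high nibble 0xE indexes LUT[0xE] = 3
// and the low nibble 0x3 indexes LUT[0x3] = 2; their sum, 5, is the
// population count of 0xE3.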
20916 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
20917 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
20918 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
20919 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
20920
20921 int NumByteElts = VecSize / 8;
20922 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
20923 SDValue In = DAG.getBitcast(ByteVecVT, Op);
20924 SmallVector<SDValue, 64> LUTVec;
20925 for (int i = 0; i < NumByteElts; ++i)
20926 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
20927 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
20928 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
20929
20930 // High nibbles
20931 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
20932 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
20933
20934 // Low nibbles
20935 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
20936
20937 // The input vector is used as the shuffle mask that index elements into the
20938 // LUT. After counting low and high nibbles, add the vector to obtain the
20939 // final pop count per i8 element.
20940 SDValue HighPopCnt =
20941 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
20942 SDValue LowPopCnt =
20943 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
20944 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
20945
20946 if (EltVT == MVT::i8)
20947 return PopCnt;
20948
20949 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
20950 }
20951
20952 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
20953 const X86Subtarget &Subtarget,
20954 SelectionDAG &DAG) {
20955 MVT VT = Op.getSimpleValueType();
20956 assert(VT.is128BitVector() &&
20957 "Only 128-bit vector bitmath lowering supported.");
20958
20959 int VecSize = VT.getSizeInBits();
20960 MVT EltVT = VT.getVectorElementType();
20961 int Len = EltVT.getSizeInBits();
20962
20963 // This is the vectorized version of the "best" algorithm from
20964 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
20965 // with a minor tweak to use a series of adds + shifts instead of vector
20966 // multiplications. Implemented for all integer vector types. We only use
20967 // this when we don't have SSSE3 which allows a LUT-based lowering that is
20968 // much faster, even faster than using native popcnt instructions.
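// Worked example (illustrative) on the byte 0xB7 (population count 6):
// 0xB7 - ((0xB7 >> 1) & 0x55) = 0x66 (per-2-bit counts 1,2,1,2),
// (0x66 & 0x33) + ((0x66 >> 2) & 0x33) = 0x33 (per-nibble counts 3,3),
// (0x33 + (0x33 >> 4)) & 0x0F = 0x06, the byte's population count.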
20969
20970 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
20971 MVT VT = V.getSimpleValueType();
20972 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
20973 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
20974 };
20975 auto GetMask = [&](SDValue V, APInt Mask) {
20976 MVT VT = V.getSimpleValueType();
20977 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
20978 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
20979 };
20980
20981 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
20982 // x86, so set the SRL type to have elements at least i16 wide. This is
20983 // correct because all of our SRLs are followed immediately by a mask anyway
20984 // that handles any bits that sneak into the high bits of the byte elements.
20985 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
20986
20987 SDValue V = Op;
20988
20989 // v = v - ((v >> 1) & 0x55555555...)
20990 SDValue Srl =
20991 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
20992 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
20993 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
20994
20995 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
20996 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
20997 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
20998 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
20999 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
21000
21001 // v = (v + (v >> 4)) & 0x0F0F0F0F...
21002 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
21003 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
21004 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
21005
21006 // At this point, V contains the byte-wise population count, and we are
21007 // merely doing a horizontal sum if necessary to get the wider element
21008 // counts.
21009 if (EltVT == MVT::i8)
21010 return V;
21011
21012 return LowerHorizontalByteSum(
21013 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
21014 DAG);
21015 }
21016
21017 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
21018 SelectionDAG &DAG) {
21019 MVT VT = Op.getSimpleValueType();
21020 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
21021 "Unknown CTPOP type to handle");
21022 SDLoc DL(Op.getNode());
21023 SDValue Op0 = Op.getOperand(0);
21024
21025 if (!Subtarget.hasSSSE3()) {
21026 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
21027 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
21028 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
21029 }
21030
21031 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21032 unsigned NumElems = VT.getVectorNumElements();
21033
21034 // Extract each 128-bit vector, compute pop count and concat the result.
21035 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
21036 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
21037
21038 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21039 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
21040 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
21041 }
21042
21043 if (VT.is512BitVector() && !Subtarget.hasBWI()) {
21044 unsigned NumElems = VT.getVectorNumElements();
21045
21046 // Extract each 256-bit vector, compute pop count and concat the result.
21047 SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
21048 SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
21049
21050 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21051 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
21052 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
21053 }
21054
21055 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
21056 }
21057
21058 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
21059 SelectionDAG &DAG) {
21060 assert(Op.getSimpleValueType().isVector() &&
21061 "We only do custom lowering for vector population count.");
21062 return LowerVectorCTPOP(Op, Subtarget, DAG);
21063 }
21064
21065 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
21066 MVT VT = Op.getSimpleValueType();
21067 SDValue In = Op.getOperand(0);
21068 SDLoc DL(Op);
21069
21070 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
21071 // perform the BITREVERSE.
21072 if (!VT.isVector()) {
21073 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
21074 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
21075 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
21076 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
21077 DAG.getIntPtrConstant(0, DL));
21078 }
21079
21080 MVT SVT = VT.getVectorElementType();
21081 int NumElts = VT.getVectorNumElements();
21082 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
21083
21084 // Decompose 256-bit ops into smaller 128-bit ops.
21085 if (VT.is256BitVector()) {
21086 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
21087 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
21088
21089 MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
21090 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21091 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
21092 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
21093 }
21094
21095 assert(VT.is128BitVector() &&
21096 "Only 128-bit vector bitreverse lowering supported.");
21097
21098 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
21099 // perform the BSWAP in the shuffle.
21100 // It's best to shuffle using the second operand as this will implicitly allow
21101 // memory folding for multiple vectors.
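// Illustrative: each mask byte (16 + SourceByte) | (2 << 5) tells VPPERM to
// take byte SourceByte from the second source and emit its bit-reversed
// value; walking the bytes of each element from high to low also performs
// the per-element byte swap.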
21102 SmallVector<SDValue, 16> MaskElts;
21103 for (int i = 0; i != NumElts; ++i) {
21104 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
21105 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
21106 int PermuteByte = SourceByte | (2 << 5);
21107 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
21108 }
21109 }
21110
21111 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
21112 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
21113 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
21114 Res, Mask);
21115 return DAG.getBitcast(VT, Res);
21116 }
21117
21118 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
21119 SelectionDAG &DAG) {
21120 if (Subtarget.hasXOP())
21121 return LowerBITREVERSE_XOP(Op, DAG);
21122
21123 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
21124
21125 MVT VT = Op.getSimpleValueType();
21126 SDValue In = Op.getOperand(0);
21127 SDLoc DL(Op);
21128
21129 unsigned NumElts = VT.getVectorNumElements();
21130 assert(VT.getScalarType() == MVT::i8 &&
21131 "Only byte vector BITREVERSE supported");
21132
21133 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
21134 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21135 MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
21136 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
21137 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
21138 Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
21139 Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
21140 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21141 }
21142
21143 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
21144 // two nibbles and a PSHUFB lookup to find the bitreverse of each
21145 // 0-15 value (moved to the other nibble).
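// Illustrative: for the byte 0x2C, the tables below give LoLUT[0xC] = 0x30
// and HiLUT[0x2] = 0x04; OR'ing them yields 0x34, which is bitreverse(0x2C).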
21146 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
21147 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
21148 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
21149
21150 const int LoLUT[16] = {
21151 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
21152 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
21153 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
21154 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
21155 const int HiLUT[16] = {
21156 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
21157 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
21158 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
21159 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
21160
21161 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
21162 for (unsigned i = 0; i < NumElts; ++i) {
21163 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
21164 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
21165 }
21166
21167 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
21168 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
21169 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
21170 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
21171 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
21172 }
21173
21174 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
21175 unsigned NewOpc = 0;
21176 switch (N->getOpcode()) {
21177 case ISD::ATOMIC_LOAD_ADD:
21178 NewOpc = X86ISD::LADD;
21179 break;
21180 case ISD::ATOMIC_LOAD_SUB:
21181 NewOpc = X86ISD::LSUB;
21182 break;
21183 case ISD::ATOMIC_LOAD_OR:
21184 NewOpc = X86ISD::LOR;
21185 break;
21186 case ISD::ATOMIC_LOAD_XOR:
21187 NewOpc = X86ISD::LXOR;
21188 break;
21189 case ISD::ATOMIC_LOAD_AND:
21190 NewOpc = X86ISD::LAND;
21191 break;
21192 default:
21193 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
21194 }
21195
21196 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
21197 return DAG.getMemIntrinsicNode(
21198 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
21199 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
21200 /*MemVT=*/N->getSimpleValueType(0), MMO);
21201 }
21202
21203 /// Lower atomic_load_ops into LOCK-prefixed operations.
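/// For example (illustrative), (atomic_load_add p, v) with an unused result
/// becomes an X86ISD::LADD memory op (a LOCK-prefixed add); if the result is
/// used, the atomic_load_add is kept and selects to a LOCK-prefixed XADD.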
21204 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
21205 const X86Subtarget &Subtarget) {
21206 SDValue Chain = N->getOperand(0);
21207 SDValue LHS = N->getOperand(1);
21208 SDValue RHS = N->getOperand(2);
21209 unsigned Opc = N->getOpcode();
21210 MVT VT = N->getSimpleValueType(0);
21211 SDLoc DL(N);
21212
21213 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
21214 // can only be lowered when the result is unused. They should have already
21215 // been transformed into a cmpxchg loop in AtomicExpand.
21216 if (N->hasAnyUseOfValue(0)) {
21217 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
21218 // select LXADD if LOCK_SUB can't be selected.
21219 if (Opc == ISD::ATOMIC_LOAD_SUB) {
21220 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
21221 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
21222 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
21223 RHS, AN->getMemOperand(), AN->getOrdering(),
21224 AN->getSynchScope());
21225 }
21226 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
21227 "Used AtomicRMW ops other than Add should have been expanded!");
21228 return N;
21229 }
21230
21231 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
21232 // RAUW the chain, but don't worry about the result, as it's unused.
21233 assert(!N->hasAnyUseOfValue(0));
21234 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
21235 return SDValue();
21236 }
21237
21238 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
21239 SDNode *Node = Op.getNode();
21240 SDLoc dl(Node);
21241 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
21242
21243 // Convert seq_cst store -> xchg
21244 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
21245 // FIXME: On 32-bit, store -> fist or movq would be more efficient
21246 // (The only way to get a 16-byte store is cmpxchg16b)
21247 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
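// Illustrative: a seq_cst i32 atomic store becomes an ATOMIC_SWAP (XCHG)
// whose loaded value is discarded; only the swap's chain result is returned.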
21248 if (cast<AtomicSDNode>(Node)->getOrdering() ==
21249 AtomicOrdering::SequentiallyConsistent ||
21250 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
21251 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
21252 cast<AtomicSDNode>(Node)->getMemoryVT(),
21253 Node->getOperand(0),
21254 Node->getOperand(1), Node->getOperand(2),
21255 cast<AtomicSDNode>(Node)->getMemOperand(),
21256 cast<AtomicSDNode>(Node)->getOrdering(),
21257 cast<AtomicSDNode>(Node)->getSynchScope());
21258 return Swap.getValue(1);
21259 }
21260 // Other atomic stores have a simple pattern.
21261 return Op;
21262 }
21263
21264 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
21265 MVT VT = Op.getNode()->getSimpleValueType(0);
21266
21267 // Let legalize expand this if it isn't a legal type yet.
21268 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
21269 return SDValue();
21270
21271 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21272
21273 unsigned Opc;
21274 bool ExtraOp = false;
21275 switch (Op.getOpcode()) {
21276 default: llvm_unreachable("Invalid code");
21277 case ISD::ADDC: Opc = X86ISD::ADD; break;
21278 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
21279 case ISD::SUBC: Opc = X86ISD::SUB; break;
21280 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
21281 }
21282
21283 if (!ExtraOp)
21284 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
21285 Op.getOperand(1));
21286 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
21287 Op.getOperand(1), Op.getOperand(2));
21288 }
21289
21290 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
21291 SelectionDAG &DAG) {
21292 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
21293
21294 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
21295 // which returns the values as { float, float } (in XMM0) or
21296 // { double, double } (which is returned in XMM0, XMM1).
21297 SDLoc dl(Op);
21298 SDValue Arg = Op.getOperand(0);
21299 EVT ArgVT = Arg.getValueType();
21300 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21301
21302 TargetLowering::ArgListTy Args;
21303 TargetLowering::ArgListEntry Entry;
21304
21305 Entry.Node = Arg;
21306 Entry.Ty = ArgTy;
21307 Entry.isSExt = false;
21308 Entry.isZExt = false;
21309 Args.push_back(Entry);
21310
21311 bool isF64 = ArgVT == MVT::f64;
21312 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
21313 // the small struct {f32, f32} is returned in (eax, edx). For f64,
21314 // the results are returned via SRet in memory.
21315 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
21316 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21317 SDValue Callee =
21318 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
21319
21320 Type *RetTy = isF64
21321 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
21322 : (Type*)VectorType::get(ArgTy, 4);
21323
21324 TargetLowering::CallLoweringInfo CLI(DAG);
21325 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
21326 .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
21327
21328 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
21329
21330 if (isF64)
21331 // Returned in xmm0 and xmm1.
21332 return CallResult.first;
21333
21334 // Returned in bits 0:31 and 32:64 xmm0.
21335 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
21336 CallResult.first, DAG.getIntPtrConstant(0, dl));
21337 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
21338 CallResult.first, DAG.getIntPtrConstant(1, dl));
21339 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
21340 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
21341 }
21342
21343 /// Widen a vector input to a vector of NVT. The
21344 /// input vector must have the same element type as NVT.
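/// For example (illustrative), widening a v2i32 value to v4i32 pads it with
/// two undef (or zero, if FillWithZeroes) elements: constant build_vectors
/// are padded directly, anything else via INSERT_SUBVECTOR at index 0.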
21345 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
21346 bool FillWithZeroes = false) {
21347 // Check if InOp already has the right width.
21348 MVT InVT = InOp.getSimpleValueType();
21349 if (InVT == NVT)
21350 return InOp;
21351
21352 if (InOp.isUndef())
21353 return DAG.getUNDEF(NVT);
21354
21355 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
21356 "input and widen element type must match");
21357
21358 unsigned InNumElts = InVT.getVectorNumElements();
21359 unsigned WidenNumElts = NVT.getVectorNumElements();
21360 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
21361 "Unexpected request for vector widening");
21362
21363 EVT EltVT = NVT.getVectorElementType();
21364
21365 SDLoc dl(InOp);
21366 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
21367 InOp.getNumOperands() == 2) {
21368 SDValue N1 = InOp.getOperand(1);
21369 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
21370 N1.isUndef()) {
21371 InOp = InOp.getOperand(0);
21372 InVT = InOp.getSimpleValueType();
21373 InNumElts = InVT.getVectorNumElements();
21374 }
21375 }
21376 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
21377 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
21378 SmallVector<SDValue, 16> Ops;
21379 for (unsigned i = 0; i < InNumElts; ++i)
21380 Ops.push_back(InOp.getOperand(i));
21381
21382 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
21383 DAG.getUNDEF(EltVT);
21384 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
21385 Ops.push_back(FillVal);
21386 return DAG.getBuildVector(NVT, dl, Ops);
21387 }
21388 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
21389 DAG.getUNDEF(NVT);
21390 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
21391 InOp, DAG.getIntPtrConstant(0, dl));
21392 }
21393
21394 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
21395 SelectionDAG &DAG) {
21396 assert(Subtarget.hasAVX512() &&
21397 "MGATHER/MSCATTER are supported on AVX-512 arch only");
21398
21399 // X86 scatter kills the mask register, so its type should be added to
21400 // the list of return values.
21401 // If the "scatter" has 2 return values, it is already handled.
21402 if (Op.getNode()->getNumValues() == 2)
21403 return Op;
21404
21405 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
21406 SDValue Src = N->getValue();
21407 MVT VT = Src.getSimpleValueType();
21408 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
21409 SDLoc dl(Op);
21410
21411 SDValue NewScatter;
21412 SDValue Index = N->getIndex();
21413 SDValue Mask = N->getMask();
21414 SDValue Chain = N->getChain();
21415 SDValue BasePtr = N->getBasePtr();
21416 MVT MemVT = N->getMemoryVT().getSimpleVT();
21417 MVT IndexVT = Index.getSimpleValueType();
21418 MVT MaskVT = Mask.getSimpleValueType();
21419
21420 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
21421 // The v2i32 value was promoted to v2i64.
21422 // Now we "redo" the type legalizer's work and widen the original
21423 // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
21424 // with a shuffle.
21425 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
21426 "Unexpected memory type");
21427 int ShuffleMask[] = {0, 2, -1, -1};
21428 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
21429 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
21430 // Now we have 4 elements instead of 2.
21431 // Expand the index.
21432 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
21433 Index = ExtendToType(Index, NewIndexVT, DAG);
21434
21435 // Expand the mask with zeroes
21436 // Mask may be <2 x i64> or <2 x i1> at this moment
21437 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
21438 "Unexpected mask type");
21439 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
21440 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
21441 VT = MVT::v4i32;
21442 }
21443
21444 unsigned NumElts = VT.getVectorNumElements();
21445 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
21446 !Index.getSimpleValueType().is512BitVector()) {
21447 // AVX512F supports only 512-bit vectors: either the data or the index must
21448 // be 512 bits wide. If both the index and data are 256-bit but the vector
21449 // contains 8 elements, we just sign-extend the index
21450 if (IndexVT == MVT::v8i32)
21451 // Just extend index
21452 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21453 else {
21454 // The minimum number of elements in a scatter is 8.
21455 NumElts = 8;
21456 // Index
21457 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
21458 // Use original index here, do not modify the index twice
21459 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
21460 if (IndexVT.getScalarType() == MVT::i32)
21461 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21462
21463 // Mask
21464 // At this point the mask operand has already been promoted
21465 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
21466 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
21467 // Use the original mask here, do not modify the mask twice
21468 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
21469
21470 // The value that should be stored
21471 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
21472 Src = ExtendToType(Src, NewVT, DAG);
21473 }
21474 }
21475 // If the mask is "wide" at this point, truncate it to an i1 vector.
21476 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
21477 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
21478
21479 // The mask is killed by scatter, add it to the values
21480 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
21481 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
21482 NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
21483 N->getMemOperand());
21484 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
21485 return SDValue(NewScatter.getNode(), 1);
21486 }
21487
21488 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
21489 SelectionDAG &DAG) {
21490
21491 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
21492 MVT VT = Op.getSimpleValueType();
21493 MVT ScalarVT = VT.getScalarType();
21494 SDValue Mask = N->getMask();
21495 SDLoc dl(Op);
21496
21497 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
21498 "Cannot lower masked load op.");
21499
21500 assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
21501 (Subtarget.hasBWI() &&
21502 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
21503 "Unsupported masked load op.");
21504
21505 // This operation is legal for targets with VLX, but without
21506 // VLX the vector should be widened to 512 bits.
21507 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
21508 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
21509 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
21510 SDValue Src0 = N->getSrc0();
21511 Src0 = ExtendToType(Src0, WideDataVT, DAG);
21512 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
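// The added mask lanes are zero, so the wide load does not read memory for
// the extra elements; the original-width result is extracted below.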
21513 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
21514 N->getBasePtr(), Mask, Src0,
21515 N->getMemoryVT(), N->getMemOperand(),
21516 N->getExtensionType());
21517
21518 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
21519 NewLoad.getValue(0),
21520 DAG.getIntPtrConstant(0, dl));
21521 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
21522 return DAG.getMergeValues(RetOps, dl);
21523 }
21524
21525 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
21526 SelectionDAG &DAG) {
21527 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
21528 SDValue DataToStore = N->getValue();
21529 MVT VT = DataToStore.getSimpleValueType();
21530 MVT ScalarVT = VT.getScalarType();
21531 SDValue Mask = N->getMask();
21532 SDLoc dl(Op);
21533
21534 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
21535 "Cannot lower masked store op.");
21536
21537 assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
21538 (Subtarget.hasBWI() &&
21539 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
21540 "Unsupported masked store op.");
21541
21542 // This operation is legal for targets with VLX, but without
21543 // VLX the vector should be widened to 512 bits.
21544 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
21545 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
21546 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
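// The added mask lanes are zero, so the widened store never writes the
// extra elements to memory.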
21547 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
21548 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
21549 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
21550 Mask, N->getMemoryVT(), N->getMemOperand(),
21551 N->isTruncatingStore());
21552 }
21553
21554 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
21555 SelectionDAG &DAG) {
21556 assert(Subtarget.hasAVX512() &&
21557 "MGATHER/MSCATTER are supported on AVX-512 arch only");
21558
21559 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
21560 SDLoc dl(Op);
21561 MVT VT = Op.getSimpleValueType();
21562 SDValue Index = N->getIndex();
21563 SDValue Mask = N->getMask();
21564 SDValue Src0 = N->getValue();
21565 MVT IndexVT = Index.getSimpleValueType();
21566 MVT MaskVT = Mask.getSimpleValueType();
21567
21568 unsigned NumElts = VT.getVectorNumElements();
21569 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
21570
21571 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
21572 !Index.getSimpleValueType().is512BitVector()) {
21573 // AVX512F supports only 512-bit vectors. Either the data or the index
21574 // must be 512 bits wide. If both the index and the data are 256-bit but
21575 // the vector contains 8 elements, we just sign-extend the index.
21576 if (NumElts == 8) {
21577 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
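// Reuse the original gather node, replacing only the index operand with
// its sign-extended form.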
21578 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
21579 N->getOperand(3), Index };
21580 DAG.UpdateNodeOperands(N, Ops);
21581 return Op;
21582 }
21583
21584 // The minimum number of elements in a gather is 8.
21585 NumElts = 8;
21586 // Index
21587 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
21588 Index = ExtendToType(Index, NewIndexVT, DAG);
21589 if (IndexVT.getScalarType() == MVT::i32)
21590 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21591
21592 // Mask
21593 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
21594 // At this point the mask operand has been promoted.
21595 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
21596 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
21597 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
21598 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
21599
21600 // The pass-thru value
21601 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
21602 Src0 = ExtendToType(Src0, NewVT, DAG);
21603
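// Gather operands: chain, pass-thru value, mask, base pointer, index.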
21604 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
21605 SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
21606 N->getMemoryVT(), dl, Ops,
21607 N->getMemOperand());
21608 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
21609 NewGather.getValue(0),
21610 DAG.getIntPtrConstant(0, dl));
21611 SDValue RetOps[] = {Extract, NewGather.getValue(1)};
21612 return DAG.getMergeValues(RetOps, dl);
21613 }
21614 return Op;
21615 }
21616
21617 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
21618 SelectionDAG &DAG) const {
21619 // TODO: Eventually, the lowering of these nodes should be informed by or
21620 // deferred to the GC strategy for the function in which they appear. For
21621 // now, however, they must be lowered to something. Since they are logically
21622 // no-ops in the case of a null GC strategy (or a GC strategy which does not
21623 // require special handling for these nodes), lower them as literal NOOPs for
21624 // the time being.
21625 SmallVector<SDValue, 2> Ops;
21626
21627 Ops.push_back(Op.getOperand(0));
21628 if (Op->getGluedNode())
21629 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
21630
21631 SDLoc OpDL(Op);
21632 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
21633 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
21634
21635 return NOOP;
21636 }
21637
21638 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
21639 SelectionDAG &DAG) const {
21640 // TODO: Eventually, the lowering of these nodes should be informed by or
21641 // deferred to the GC strategy for the function in which they appear. For
21642 // now, however, they must be lowered to something. Since they are logically
21643 // no-ops in the case of a null GC strategy (or a GC strategy which does not
21644 // require special handling for these nodes), lower them as literal NOOPs for
21645 // the time being.
21646 SmallVector<SDValue, 2> Ops;
21647
21648 Ops.push_back(Op.getOperand(0));
21649 if (Op->getGluedNode())
21650 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
21651
21652 SDLoc OpDL(Op);
21653 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
21654 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
21655
21656 return NOOP;
21657 }
21658
21659 /// Provide custom lowering hooks for some operations.
21660 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
21661 switch (Op.getOpcode()) {
21662 default: llvm_unreachable("Should not custom lower this!");
21663 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
21664 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
21665 return LowerCMP_SWAP(Op, Subtarget, DAG);
21666 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
21667 case ISD::ATOMIC_LOAD_ADD:
21668 case ISD::ATOMIC_LOAD_SUB:
21669 case ISD::ATOMIC_LOAD_OR:
21670 case ISD::ATOMIC_LOAD_XOR:
21671 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
21672 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
21673 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
21674 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
21675 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
21676 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
21677 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
21678 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
21679 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
21680 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
21681 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
21682 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
21683 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
21684 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
21685 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
21686 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
21687 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
21688 case ISD::SHL_PARTS:
21689 case ISD::SRA_PARTS:
21690 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
21691 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
21692 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
21693 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
21694 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
21695 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
21696 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
21697 case ISD::SIGN_EXTEND_VECTOR_INREG:
21698 return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
21699 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
21700 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
21701 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
21702 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
21703 case ISD::FABS:
21704 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
21705 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
21706 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
21707 case ISD::SETCC: return LowerSETCC(Op, DAG);
21708 case ISD::SETCCE: return LowerSETCCE(Op, DAG);
21709 case ISD::SELECT: return LowerSELECT(Op, DAG);
21710 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
21711 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
21712 case ISD::VASTART: return LowerVASTART(Op, DAG);
21713 case ISD::VAARG: return LowerVAARG(Op, DAG);
21714 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
21715 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
21716 case ISD::INTRINSIC_VOID:
21717 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
21718 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
21719 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
21720 case ISD::FRAME_TO_ARGS_OFFSET:
21721 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
21722 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
21723 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
21724 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
21725 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
21726 case ISD::EH_SJLJ_SETUP_DISPATCH:
21727 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
21728 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
21729 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
21730 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
21731 case ISD::CTLZ:
21732 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
21733 case ISD::CTTZ:
21734 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
21735 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
21736 case ISD::MULHS:
21737 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
21738 case ISD::UMUL_LOHI:
21739 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
21740 case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
21741 case ISD::SRA:
21742 case ISD::SRL:
21743 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
21744 case ISD::SADDO:
21745 case ISD::UADDO:
21746 case ISD::SSUBO:
21747 case ISD::USUBO:
21748 case ISD::SMULO:
21749 case ISD::UMULO: return LowerXALUO(Op, DAG);
21750 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
21751 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
21752 case ISD::ADDC:
21753 case ISD::ADDE:
21754 case ISD::SUBC:
21755 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
21756 case ISD::ADD: return LowerADD(Op, DAG);
21757 case ISD::SUB: return LowerSUB(Op, DAG);
21758 case ISD::SMAX:
21759 case ISD::SMIN:
21760 case ISD::UMAX:
21761 case ISD::UMIN: return LowerMINMAX(Op, DAG);
21762 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
21763 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
21764 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
21765 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
21766 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
21767 case ISD::GC_TRANSITION_START:
21768 return LowerGC_TRANSITION_START(Op, DAG);
21769 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
21770 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
21771 }
21772 }
21773
21774 /// Places new result values for the node in Results (their number
21775 /// and types must exactly match those of the original return values of
21776 /// the node), or leaves Results empty, which indicates that the node is not
21777 /// to be custom lowered after all.
21778 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
21779 SmallVectorImpl<SDValue> &Results,
21780 SelectionDAG &DAG) const {
21781 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
21782
21783 if (!Res.getNode())
21784 return;
21785
21786 assert((N->getNumValues() <= Res->getNumValues()) &&
21787 "Lowering returned the wrong number of results!");
21788
21789 // Place new result values based on N's result numbering.
21790 // In some cases (LowerSINT_TO_FP, for example) Res has more result values
21791 // than the original node; the extra chain (the last value) should be dropped.
21792 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
21793 Results.push_back(Res.getValue(I));
21794 }
21795
21796 /// Replace a node with an illegal result type with a new node built out of
21797 /// custom code.
21798 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
21799 SmallVectorImpl<SDValue>&Results,
21800 SelectionDAG &DAG) const {
21801 SDLoc dl(N);
21802 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21803 switch (N->getOpcode()) {
21804 default:
21805 llvm_unreachable("Do not know how to custom type legalize this operation!");
21806 case X86ISD::AVG: {
21807 // Legalize types for X86ISD::AVG by expanding vectors.
21808 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
21809
21810 auto InVT = N->getValueType(0);
21811 auto InVTSize = InVT.getSizeInBits();
21812 const unsigned RegSize =
21813 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
21814 assert((!Subtarget.hasAVX512() || RegSize < 512) &&
21815 "512-bit vector requires AVX512");
21816 assert((!Subtarget.hasAVX2() || RegSize < 256) &&
21817 "256-bit vector requires AVX2");
21818
21819 auto ElemVT = InVT.getVectorElementType();
21820 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
21821 RegSize / ElemVT.getSizeInBits());
21822 assert(RegSize % InVT.getSizeInBits() == 0);
21823 unsigned NumConcat = RegSize / InVT.getSizeInBits();
21824
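// Widen both operands to the target register width by concatenating with
// undef vectors, perform the AVG on the wide type, and extract the
// original-width subvector from the result below.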
21825 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
21826 Ops[0] = N->getOperand(0);
21827 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
21828 Ops[0] = N->getOperand(1);
21829 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
21830
21831 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
21832 Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
21833 DAG.getIntPtrConstant(0, dl)));
21834 return;
21835 }
21836 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
21837 case X86ISD::FMINC:
21838 case X86ISD::FMIN:
21839 case X86ISD::FMAXC:
21840 case X86ISD::FMAX: {
21841 EVT VT = N->getValueType(0);
21842 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
21843 SDValue UNDEF = DAG.getUNDEF(VT);
21844 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
21845 N->getOperand(0), UNDEF);
21846 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
21847 N->getOperand(1), UNDEF);
21848 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
21849 return;
21850 }
21851 case ISD::SIGN_EXTEND_INREG:
21852 case ISD::ADDC:
21853 case ISD::ADDE:
21854 case ISD::SUBC:
21855 case ISD::SUBE:
21856 // We don't want to expand or promote these.
21857 return;
21858 case ISD::SDIV:
21859 case ISD::UDIV:
21860 case ISD::SREM:
21861 case ISD::UREM:
21862 case ISD::SDIVREM:
21863 case ISD::UDIVREM: {
21864 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
21865 Results.push_back(V);
21866 return;
21867 }
21868 case ISD::FP_TO_SINT:
21869 case ISD::FP_TO_UINT: {
21870 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
21871
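// FP_TO_INTHelper may lower the conversion through an in-memory FIST
// sequence; in that case the integer result is loaded back from the
// returned stack slot.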
21872 std::pair<SDValue,SDValue> Vals =
21873 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
21874 SDValue FIST = Vals.first, StackSlot = Vals.second;
21875 if (FIST.getNode()) {
21876 EVT VT = N->getValueType(0);
21877 // Return a load from the stack slot.
21878 if (StackSlot.getNode())
21879 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
21880 MachinePointerInfo(),
21881 false, false, false, 0));
21882 else
21883 Results.push_back(FIST);
21884 }
21885 return;
21886 }
21887 case ISD::UINT_TO_FP: {
21888 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
21889 if (N->getOperand(0).getValueType() != MVT::v2i32 ||
21890 N->getValueType(0) != MVT::v2f32)
21891 return;
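// Zero-extend to v2i64 and OR in the bit pattern of 2^52 so each lane
// becomes the double 2^52 + x; subtracting 2^52 then yields x exactly,
// and the v2f64 result is rounded to f32 lanes with VFPROUND.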
21892 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
21893 N->getOperand(0));
21894 SDValue VBias =
21895 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
21896 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
21897 DAG.getBitcast(MVT::v2i64, VBias));
21898 Or = DAG.getBitcast(MVT::v2f64, Or);
21899 // TODO: Are there any fast-math-flags to propagate here?
21900 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
21901 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
21902 return;
21903 }
21904 case ISD::FP_ROUND: {
21905 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
21906 return;
21907 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
21908 Results.push_back(V);
21909 return;
21910 }
21911 case ISD::FP_EXTEND: {
21912 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
21913 // No other ValueType for FP_EXTEND should reach this point.
21914 assert(N->getValueType(0) == MVT::v2f32 &&
21915 "Do not know how to legalize this Node");
21916 return;
21917 }
21918 case ISD::INTRINSIC_W_CHAIN: {
21919 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
21920 switch (IntNo) {
21921 default : llvm_unreachable("Do not know how to custom type "
21922 "legalize this intrinsic operation!");
21923 case Intrinsic::x86_rdtsc:
21924 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
21925 Results);
21926 case Intrinsic::x86_rdtscp:
21927 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
21928 Results);
21929 case Intrinsic::x86_rdpmc:
21930 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
21931 }
21932 }
21933 case ISD::INTRINSIC_WO_CHAIN: {
21934 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
21935 Results.push_back(V);
21936 return;
21937 }
21938 case ISD::READCYCLECOUNTER: {
21939 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
21940 Results);
21941 }
21942 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
21943 EVT T = N->getValueType(0);
21944 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
21945 bool Regs64bit = T == MVT::i128;
21946 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
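// CMPXCHG8B/CMPXCHG16B take the expected value in EDX:EAX (RDX:RAX) and
// the replacement value in ECX:EBX (RCX:RBX), so split both wide inputs
// into halves and copy them into those register pairs.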
21947 SDValue cpInL, cpInH;
21948 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
21949 DAG.getConstant(0, dl, HalfT));
21950 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
21951 DAG.getConstant(1, dl, HalfT));
21952 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
21953 Regs64bit ? X86::RAX : X86::EAX,
21954 cpInL, SDValue());
21955 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
21956 Regs64bit ? X86::RDX : X86::EDX,
21957 cpInH, cpInL.getValue(1));
21958 SDValue swapInL, swapInH;
21959 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
21960 DAG.getConstant(0, dl, HalfT));
21961 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
21962 DAG.getConstant(1, dl, HalfT));
21963 swapInH =
21964 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
21965 swapInH, cpInH.getValue(1));
21966 // If the current function needs the base pointer, RBX,
21967 // we shouldn't use cmpxchg directly.
21968 // Lowering that instruction would clobber RBX, and since RBX
21969 // will then be a reserved register, the register allocator will
21970 // not ensure that its value is properly saved and restored
21971 // around this live range.
21972 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
21973 SDValue Result;
21974 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21975 unsigned BasePtr = TRI->getBaseRegister();
21976 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
21977 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
21978 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
21979 // ISel prefers the LCMPXCHG64 variant.
21980 // If the assert below fires, that is no longer the case, and we need
21981 // to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX, not just EBX.
21982 // That is a matter of accepting an i64 input for that pseudo and
21983 // restoring into a register of the right width in the expand pseudo.
21984 // Everything else should just work.
21985 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
21986 "Saving only half of the RBX");
21987 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
21988 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
21989 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
21990 Regs64bit ? X86::RBX : X86::EBX,
21991 HalfT, swapInH.getValue(1));
21992 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
21993 RBXSave,
21994 /*Glue*/ RBXSave.getValue(2)};
21995 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
21996 } else {
21997 unsigned Opcode =
21998 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
21999 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
22000 Regs64bit ? X86::RBX : X86::EBX, swapInL,
22001 swapInH.getValue(1));
22002 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
22003 swapInL.getValue(1)};
22004 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
22005 }
22006 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
22007 Regs64bit ? X86::RAX : X86::EAX,
22008 HalfT, Result.getValue(1));
22009 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
22010 Regs64bit ? X86::RDX : X86::EDX,
22011 HalfT, cpOutL.getValue(2));
22012 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
22013
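// CMPXCHG reports success via ZF, so read EFLAGS and materialize the
// success value with a SETCC on COND_E.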
22014 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
22015 MVT::i32, cpOutH.getValue(2));
22016 SDValue Success =
22017 DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22018 DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS);
22019 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
22020
22021 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
22022 Results.push_back(Success);
22023 Results.push_back(EFLAGS.getValue(1));
22024 return;
22025 }
22026 case ISD::ATOMIC_SWAP:
22027 case ISD::ATOMIC_LOAD_ADD:
22028 case ISD::ATOMIC_LOAD_SUB:
22029 case ISD::ATOMIC_LOAD_AND:
22030 case ISD::ATOMIC_LOAD_OR:
22031 case ISD::ATOMIC_LOAD_XOR:
22032 case ISD::ATOMIC_LOAD_NAND:
22033 case ISD::ATOMIC_LOAD_MIN:
22034 case ISD::ATOMIC_LOAD_MAX:
22035 case ISD::ATOMIC_LOAD_UMIN:
22036 case ISD::ATOMIC_LOAD_UMAX:
22037 case ISD::ATOMIC_LOAD: {
22038 // Delegate to generic TypeLegalization. Situations we can really handle
22039 // should have already been dealt with by AtomicExpandPass.cpp.
22040 break;
22041 }
22042 case ISD::BITCAST: {
22043 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
22044 EVT DstVT = N->getValueType(0);
22045 EVT SrcVT = N->getOperand(0)->getValueType(0);
22046
22047 if (SrcVT != MVT::f64 ||
22048 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
22049 return;
22050
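// Insert the f64 into the low half of a v2f64, bitcast that to a vector
// with twice as many integer elements, and either return it directly
// (when widening legalization is enabled) or rebuild the narrow result
// from its low NumElts elements below.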
22051 unsigned NumElts = DstVT.getVectorNumElements();
22052 EVT SVT = DstVT.getVectorElementType();
22053 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
22054 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
22055 MVT::v2f64, N->getOperand(0));
22056 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
22057
22058 if (ExperimentalVectorWideningLegalization) {
22059 // If we are legalizing vectors by widening, we already have the desired
22060 // legal vector type, just return it.
22061 Results.push_back(ToVecInt);
22062 return;
22063 }
22064
22065 SmallVector<SDValue, 8> Elts;
22066 for (unsigned i = 0, e = NumElts; i != e; ++i)
22067 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
22068 ToVecInt, DAG.getIntPtrConstant(i, dl)));
22069
22070 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
22071 }
22072 }
22073 }
22074
22075 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
22076 switch ((X86ISD::NodeType)Opcode) {
22077 case X86ISD::FIRST_NUMBER: break;
22078 case X86ISD::BSF: return "X86ISD::BSF";
22079 case X86ISD::BSR: return "X86ISD::BSR";
22080 case X86ISD::SHLD: return "X86ISD::SHLD";
22081 case X86ISD::SHRD: return "X86ISD::SHRD";
22082 case X86ISD::FAND: return "X86ISD::FAND";
22083 case X86ISD::FANDN: return "X86ISD::FANDN";
22084 case X86ISD::FOR: return "X86ISD::FOR";
22085 case X86ISD::FXOR: return "X86ISD::FXOR";
22086 case X86ISD::FILD: return "X86ISD::FILD";
22087 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
22088 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
22089 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
22090 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
22091 case X86ISD::FLD: return "X86ISD::FLD";
22092 case X86ISD::FST: return "X86ISD::FST";
22093 case X86ISD::CALL: return "X86ISD::CALL";
22094 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
22095 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
22096 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
22097 case X86ISD::BT: return "X86ISD::BT";
22098 case X86ISD::CMP: return "X86ISD::CMP";
22099 case X86ISD::COMI: return "X86ISD::COMI";
22100 case X86ISD::UCOMI: return "X86ISD::UCOMI";
22101 case X86ISD::CMPM: return "X86ISD::CMPM";
22102 case X86ISD::CMPMU: return "X86ISD::CMPMU";
22103 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
22104 case X86ISD::SETCC: return "X86ISD::SETCC";
22105 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
22106 case X86ISD::FSETCC: return "X86ISD::FSETCC";
22107 case X86ISD::CMOV: return "X86ISD::CMOV";
22108 case X86ISD::BRCOND: return "X86ISD::BRCOND";
22109 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
22110 case X86ISD::IRET: return "X86ISD::IRET";
22111 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
22112 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
22113 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
22114 case X86ISD::Wrapper: return "X86ISD::Wrapper";
22115 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
22116 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
22117 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
22118 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
22119 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
22120 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
22121 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
22122 case X86ISD::PINSRB: return "X86ISD::PINSRB";
22123 case X86ISD::PINSRW: return "X86ISD::PINSRW";
22124 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
22125 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
22126 case X86ISD::ANDNP: return "X86ISD::ANDNP";
22127 case X86ISD::BLENDI: return "X86ISD::BLENDI";
22128 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
22129 case X86ISD::ADDUS: return "X86ISD::ADDUS";
22130 case X86ISD::SUBUS: return "X86ISD::SUBUS";
22131 case X86ISD::HADD: return "X86ISD::HADD";
22132 case X86ISD::HSUB: return "X86ISD::HSUB";
22133 case X86ISD::FHADD: return "X86ISD::FHADD";
22134 case X86ISD::FHSUB: return "X86ISD::FHSUB";
22135 case X86ISD::ABS: return "X86ISD::ABS";
22136 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
22137 case X86ISD::FMAX: return "X86ISD::FMAX";
22138 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
22139 case X86ISD::FMIN: return "X86ISD::FMIN";
22140 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
22141 case X86ISD::FMAXC: return "X86ISD::FMAXC";
22142 case X86ISD::FMINC: return "X86ISD::FMINC";
22143 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
22144 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
22145 case X86ISD::FRCP: return "X86ISD::FRCP";
22146 case X86ISD::FRCPS: return "X86ISD::FRCPS";
22147 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
22148 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
22149 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
22150 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
22151 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
22152 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
22153 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
22154 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
22155 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
22156 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
22157 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
22158 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
22159 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
22160 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
22161 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
22162 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
22163 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
22164 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
22165 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
22166 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
22167 case X86ISD::LADD: return "X86ISD::LADD";
22168 case X86ISD::LSUB: return "X86ISD::LSUB";
22169 case X86ISD::LOR: return "X86ISD::LOR";
22170 case X86ISD::LXOR: return "X86ISD::LXOR";
22171 case X86ISD::LAND: return "X86ISD::LAND";
22172 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
22173 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
22174 case X86ISD::VZEXT: return "X86ISD::VZEXT";
22175 case X86ISD::VSEXT: return "X86ISD::VSEXT";
22176 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
22177 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
22178 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
22179 case X86ISD::VINSERT: return "X86ISD::VINSERT";
22180 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
22181 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
22182 case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD";
22183 case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD";
22184 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
22185 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
22186 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
22187 case X86ISD::VSHL: return "X86ISD::VSHL";
22188 case X86ISD::VSRL: return "X86ISD::VSRL";
22189 case X86ISD::VSRA: return "X86ISD::VSRA";
22190 case X86ISD::VSHLI: return "X86ISD::VSHLI";
22191 case X86ISD::VSRLI: return "X86ISD::VSRLI";
22192 case X86ISD::VSRAI: return "X86ISD::VSRAI";
22193 case X86ISD::VSRAV: return "X86ISD::VSRAV";
22194 case X86ISD::VROTLI: return "X86ISD::VROTLI";
22195 case X86ISD::VROTRI: return "X86ISD::VROTRI";
22196 case X86ISD::VPPERM: return "X86ISD::VPPERM";
22197 case X86ISD::CMPP: return "X86ISD::CMPP";
22198 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
22199 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
22200 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
22201 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
22202 case X86ISD::ADD: return "X86ISD::ADD";
22203 case X86ISD::SUB: return "X86ISD::SUB";
22204 case X86ISD::ADC: return "X86ISD::ADC";
22205 case X86ISD::SBB: return "X86ISD::SBB";
22206 case X86ISD::SMUL: return "X86ISD::SMUL";
22207 case X86ISD::UMUL: return "X86ISD::UMUL";
22208 case X86ISD::SMUL8: return "X86ISD::SMUL8";
22209 case X86ISD::UMUL8: return "X86ISD::UMUL8";
22210 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
22211 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
22212 case X86ISD::INC: return "X86ISD::INC";
22213 case X86ISD::DEC: return "X86ISD::DEC";
22214 case X86ISD::OR: return "X86ISD::OR";
22215 case X86ISD::XOR: return "X86ISD::XOR";
22216 case X86ISD::AND: return "X86ISD::AND";
22217 case X86ISD::BEXTR: return "X86ISD::BEXTR";
22218 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
22219 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
22220 case X86ISD::PTEST: return "X86ISD::PTEST";
22221 case X86ISD::TESTP: return "X86ISD::TESTP";
22222 case X86ISD::TESTM: return "X86ISD::TESTM";
22223 case X86ISD::TESTNM: return "X86ISD::TESTNM";
22224 case X86ISD::KORTEST: return "X86ISD::KORTEST";
22225 case X86ISD::KTEST: return "X86ISD::KTEST";
22226 case X86ISD::PACKSS: return "X86ISD::PACKSS";
22227 case X86ISD::PACKUS: return "X86ISD::PACKUS";
22228 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
22229 case X86ISD::VALIGN: return "X86ISD::VALIGN";
22230 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
22231 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
22232 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
22233 case X86ISD::SHUFP: return "X86ISD::SHUFP";
22234 case X86ISD::SHUF128: return "X86ISD::SHUF128";
22235 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
22236 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
22237 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
22238 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
22239 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
22240 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
22241 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
22242 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
22243 case X86ISD::MOVSD: return "X86ISD::MOVSD";
22244 case X86ISD::MOVSS: return "X86ISD::MOVSS";
22245 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
22246 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
22247 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
22248 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
22249 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
22250 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
22251 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
22252 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
22253 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
22254 case X86ISD::VPERMV: return "X86ISD::VPERMV";
22255 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
22256 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
22257 case X86ISD::VPERMI: return "X86ISD::VPERMI";
22258 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
22259 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
22260 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
22261 case X86ISD::VRANGE: return "X86ISD::VRANGE";
22262 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
22263 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
22264 case X86ISD::PSADBW: return "X86ISD::PSADBW";
22265 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
22266 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
22267 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
22268 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
22269 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
22270 case X86ISD::MFENCE: return "X86ISD::MFENCE";
22271 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
22272 case X86ISD::SAHF: return "X86ISD::SAHF";
22273 case X86ISD::RDRAND: return "X86ISD::RDRAND";
22274 case X86ISD::RDSEED: return "X86ISD::RDSEED";
22275 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
22276 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
22277 case X86ISD::VPROT: return "X86ISD::VPROT";
22278 case X86ISD::VPROTI: return "X86ISD::VPROTI";
22279 case X86ISD::VPSHA: return "X86ISD::VPSHA";
22280 case X86ISD::VPSHL: return "X86ISD::VPSHL";
22281 case X86ISD::VPCOM: return "X86ISD::VPCOM";
22282 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
22283 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
22284 case X86ISD::FMADD: return "X86ISD::FMADD";
22285 case X86ISD::FMSUB: return "X86ISD::FMSUB";
22286 case X86ISD::FNMADD: return "X86ISD::FNMADD";
22287 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
22288 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
22289 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
22290 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
22291 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
22292 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
22293 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
22294 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
22295 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
22296 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
22297 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
22298 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
22299 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
22300 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
22301 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
22302 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
22303 case X86ISD::XTEST: return "X86ISD::XTEST";
22304 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
22305 case X86ISD::EXPAND: return "X86ISD::EXPAND";
22306 case X86ISD::SELECT: return "X86ISD::SELECT";
22307 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
22308 case X86ISD::RCP28: return "X86ISD::RCP28";
22309 case X86ISD::EXP2: return "X86ISD::EXP2";
22310 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
22311 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
22312 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
22313 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
22314 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
22315 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
22316 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
22317 case X86ISD::SCALEF: return "X86ISD::SCALEF";
22318 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
22319 case X86ISD::ADDS: return "X86ISD::ADDS";
22320 case X86ISD::SUBS: return "X86ISD::SUBS";
22321 case X86ISD::AVG: return "X86ISD::AVG";
22322 case X86ISD::MULHRS: return "X86ISD::MULHRS";
22323 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
22324 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
22325 case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND";
22326 case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND";
22327 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
22328 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
22329 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
22330 case X86ISD::SCALAR_FP_TO_SINT_RND: return "X86ISD::SCALAR_FP_TO_SINT_RND";
22331 case X86ISD::SCALAR_FP_TO_UINT_RND: return "X86ISD::SCALAR_FP_TO_UINT_RND";
22332 }
22333 return nullptr;
22334 }
22335
22336 /// Return true if the addressing mode represented by AM is legal for this
22337 /// target, for a load/store of the specified type.
22338 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
22339 const AddrMode &AM, Type *Ty,
22340 unsigned AS) const {
22341 // X86 supports extremely general addressing modes.
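// The queried mode has the form BaseGV + BaseReg + Scale*IndexReg + BaseOffs,
// which maps onto x86's [base + scale*index + disp] addressing.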
22342 CodeModel::Model M = getTargetMachine().getCodeModel();
22343
22344 // X86 allows a sign-extended 32-bit immediate field as a displacement.
22345 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
22346 return false;
22347
22348 if (AM.BaseGV) {
22349 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
22350
22351 // If a reference to this global requires an extra load, we can't fold it.
22352 if (isGlobalStubReference(GVFlags))
22353 return false;
22354
22355 // If BaseGV requires a register for the PIC base, we cannot also have a
22356 // BaseReg specified.
22357 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
22358 return false;
22359
22360 // If lower 4G is not available, then we must use rip-relative addressing.
22361 if ((M != CodeModel::Small || isPositionIndependent()) &&
22362 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
22363 return false;
22364 }
22365
22366 switch (AM.Scale) {
22367 case 0:
22368 case 1:
22369 case 2:
22370 case 4:
22371 case 8:
22372 // These scales always work.
22373 break;
22374 case 3:
22375 case 5:
22376 case 9:
22377 // These scales are formed with basereg+scalereg. Only accept if there is
22378 // no basereg yet.
22379 if (AM.HasBaseReg)
22380 return false;
22381 break;
22382 default: // Other stuff never works.
22383 return false;
22384 }
22385
22386 return true;
22387 }
22388
22389 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
22390 unsigned Bits = Ty->getScalarSizeInBits();
22391
22392 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
22393 // particularly cheaper than those without.
22394 if (Bits == 8)
22395 return false;
22396
22397 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
22398 // variable shifts just as cheap as scalar ones.
22399 if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
22400 return false;
22401
22402 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
22403 // fully general vector.
22404 return true;
22405 }
22406
22407 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
22408 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
22409 return false;
22410 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
22411 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
22412 return NumBits1 > NumBits2;
22413 }
22414
22415 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
22416 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
22417 return false;
22418
22419 if (!isTypeLegal(EVT::getEVT(Ty1)))
22420 return false;
22421
22422 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
22423
22424 // Assuming the caller doesn't have a zeroext or signext return parameter,
22425 // truncation all the way down to i1 is valid.
22426 return true;
22427 }
22428
22429 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
22430 return isInt<32>(Imm);
22431 }
22432
22433 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
22434 // Can also use sub to handle negated immediates.
22435 return isInt<32>(Imm);
22436 }
22437
22438 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
22439 if (!VT1.isInteger() || !VT2.isInteger())
22440 return false;
22441 unsigned NumBits1 = VT1.getSizeInBits();
22442 unsigned NumBits2 = VT2.getSizeInBits();
22443 return NumBits1 > NumBits2;
22444 }
22445
22446 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
22447 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
22448 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
22449 }
22450
22451 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
22452 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
22453 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
22454 }
22455
22456 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
22457 EVT VT1 = Val.getValueType();
22458 if (isZExtFree(VT1, VT2))
22459 return true;
22460
22461 if (Val.getOpcode() != ISD::LOAD)
22462 return false;
22463
22464 if (!VT1.isSimple() || !VT1.isInteger() ||
22465 !VT2.isSimple() || !VT2.isInteger())
22466 return false;
22467
22468 switch (VT1.getSimpleVT().SimpleTy) {
22469 default: break;
22470 case MVT::i8:
22471 case MVT::i16:
22472 case MVT::i32:
22473 // X86 has 8, 16, and 32-bit zero-extending loads.
22474 return true;
22475 }
22476
22477 return false;
22478 }
22479
22480 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
22481
22482 bool
22483 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
22484 if (!Subtarget.hasAnyFMA())
22485 return false;
22486
22487 VT = VT.getScalarType();
22488
22489 if (!VT.isSimple())
22490 return false;
22491
22492 switch (VT.getSimpleVT().SimpleTy) {
22493 case MVT::f32:
22494 case MVT::f64:
22495 return true;
22496 default:
22497 break;
22498 }
22499
22500 return false;
22501 }
22502
22503 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
22504 // i16 instructions are longer (0x66 prefix) and potentially slower.
22505 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
22506 }
22507
22508 /// Targets can use this to indicate that they only support *some*
22509 /// VECTOR_SHUFFLE operations, those with specific masks.
22510 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
22511 /// are assumed to be legal.
22512 bool
22513 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
22514 EVT VT) const {
22515 if (!VT.isSimple())
22516 return false;
22517
22518 // Not for i1 vectors
22519 if (VT.getSimpleVT().getScalarType() == MVT::i1)
22520 return false;
22521
22522 // Very little shuffling can be done for 64-bit vectors right now.
22523 if (VT.getSimpleVT().getSizeInBits() == 64)
22524 return false;
22525
22526 // We only care that the types being shuffled are legal. The lowering can
22527 // handle any possible shuffle mask that results.
22528 return isTypeLegal(VT.getSimpleVT());
22529 }
22530
22531 bool
22532 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
22533 EVT VT) const {
22534 // Just delegate to the generic legality, clear masks aren't special.
22535 return isShuffleMaskLegal(Mask, VT);
22536 }
22537
22538 //===----------------------------------------------------------------------===//
22539 // X86 Scheduler Hooks
22540 //===----------------------------------------------------------------------===//
22541
22542 /// Utility function to emit xbegin specifying the start of an RTM region.
22543 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
22544 const TargetInstrInfo *TII) {
22545 DebugLoc DL = MI.getDebugLoc();
22546
22547 const BasicBlock *BB = MBB->getBasicBlock();
22548 MachineFunction::iterator I = ++MBB->getIterator();
22549
22550 // For the v = xbegin(), we generate
22551 //
22552 // thisMBB:
22553 // xbegin sinkMBB
22554 //
22555 // mainMBB:
22556 // eax = -1
22557 //
22558 // sinkMBB:
22559 // v = eax
22560
22561 MachineBasicBlock *thisMBB = MBB;
22562 MachineFunction *MF = MBB->getParent();
22563 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
22564 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
22565 MF->insert(I, mainMBB);
22566 MF->insert(I, sinkMBB);
22567
22568 // Transfer the remainder of BB and its successor edges to sinkMBB.
22569 sinkMBB->splice(sinkMBB->begin(), MBB,
22570 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
22571 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
22572
22573 // thisMBB:
22574 // xbegin sinkMBB
22575 // # fallthrough to mainMBB
22576 //  # abort branches to sinkMBB
22577 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
22578 thisMBB->addSuccessor(mainMBB);
22579 thisMBB->addSuccessor(sinkMBB);
22580
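// XBEGIN falls through to mainMBB when the transaction starts, where the
// -1 "started" value is materialized in EAX; on abort, execution resumes
// at sinkMBB with the abort status already in EAX.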
22581 // mainMBB:
22582 // EAX = -1
22583 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
22584 mainMBB->addSuccessor(sinkMBB);
22585
22586 // sinkMBB:
22587 // EAX is live into the sinkMBB
22588 sinkMBB->addLiveIn(X86::EAX);
22589 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
22590 MI.getOperand(0).getReg())
22591 .addReg(X86::EAX);
22592
22593 MI.eraseFromParent();
22594 return sinkMBB;
22595 }
22596
22597 // FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8
22598 // or XMM0_V32I8 in AVX, all of this code can be replaced with that
22599 // in the .td file.
22600 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
22601 const TargetInstrInfo *TII) {
22602 unsigned Opc;
22603 switch (MI.getOpcode()) {
22604 default: llvm_unreachable("illegal opcode!");
22605 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
22606 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
22607 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
22608 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
22609 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
22610 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
22611 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
22612 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
22613 }
22614
22615 DebugLoc dl = MI.getDebugLoc();
22616 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
22617
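// Forward the pseudo's explicit source operands (operand 0 is the result,
// and implicit register operands are dropped) to the real instruction;
// the result is copied out of XMM0 below.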
22618 unsigned NumArgs = MI.getNumOperands();
22619 for (unsigned i = 1; i < NumArgs; ++i) {
22620 MachineOperand &Op = MI.getOperand(i);
22621 if (!(Op.isReg() && Op.isImplicit()))
22622 MIB.addOperand(Op);
22623 }
22624 if (MI.hasOneMemOperand())
22625 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
22626
22627 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
22628 .addReg(X86::XMM0);
22629
22630 MI.eraseFromParent();
22631 return BB;
22632 }
22633
22634 // FIXME: Custom handling because TableGen doesn't support multiple implicit
22635 // defs in an instruction pattern
22636 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
22637 const TargetInstrInfo *TII) {
22638 unsigned Opc;
22639 switch (MI.getOpcode()) {
22640 default: llvm_unreachable("illegal opcode!");
22641 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
22642 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
22643 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
22644 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
22645 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
22646 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
22647 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
22648 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
22649 }
22650
22651 DebugLoc dl = MI.getDebugLoc();
22652 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
22653
22654 unsigned NumArgs = MI.getNumOperands(); // operand 0 (the result) is skipped below
22655 for (unsigned i = 1; i < NumArgs; ++i) {
22656 MachineOperand &Op = MI.getOperand(i);
22657 if (!(Op.isReg() && Op.isImplicit()))
22658 MIB.addOperand(Op);
22659 }
22660 if (MI.hasOneMemOperand())
22661 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
22662
22663 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
22664 .addReg(X86::ECX);
22665
22666 MI.eraseFromParent();
22667 return BB;
22668 }
22669
22670 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
22671 const X86Subtarget &Subtarget) {
22672 DebugLoc dl = MI.getDebugLoc();
22673 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22674
22675 // Insert the input VAL into EAX.
22676 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
22677 .addReg(MI.getOperand(0).getReg());
22678 // Insert zero into ECX.
22679 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
22680
22681 // Insert zero into EDX.
22682 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
22683
22684 // Insert the WRPKRU instruction.
22685 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
22686
22687 MI.eraseFromParent(); // The pseudo is gone now.
22688 return BB;
22689 }
22690
22691 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
22692 const X86Subtarget &Subtarget) {
22693 DebugLoc dl = MI.getDebugLoc();
22694 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22695
22696 // Insert zero into ECX.
22697 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
22698
22699 // Insert the RDPKRU instruction.
22700 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
22701 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
22702 .addReg(X86::EAX);
22703
22704 MI.eraseFromParent(); // The pseudo is gone now.
22705 return BB;
22706 }
22707
22708 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
22709 const X86Subtarget &Subtarget,
22710 unsigned Opc) {
22711 DebugLoc dl = MI.getDebugLoc();
22712 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22713 // Address into RAX/EAX, other two args into ECX, EDX.
22714 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
22715 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
22716 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
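// Copy the five address operands (base, scale, index, displacement,
// segment) from the pseudo into the LEA that materializes the address.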
22717 for (int i = 0; i < X86::AddrNumOperands; ++i)
22718 MIB.addOperand(MI.getOperand(i));
22719
22720 unsigned ValOps = X86::AddrNumOperands;
22721 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
22722 .addReg(MI.getOperand(ValOps).getReg());
22723 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
22724 .addReg(MI.getOperand(ValOps + 1).getReg());
22725
22726 // The instruction doesn't actually take any operands though.
22727 BuildMI(*BB, MI, dl, TII->get(Opc));
22728
22729 MI.eraseFromParent(); // The pseudo is gone now.
22730 return BB;
22731 }
22732
22733 MachineBasicBlock *
22734 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
22735 MachineBasicBlock *MBB) const {
22736 // Emit va_arg instruction on X86-64.
22737
22738 // Operands to this pseudo-instruction:
22739 // 0 ) Output : destination address (reg)
22740 // 1-5) Input : va_list address (addr, i64mem)
22741 // 6 ) ArgSize : Size (in bytes) of vararg type
22742 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
22743 // 8 ) Align : Alignment of type
22744 // 9 ) EFLAGS (implicit-def)
22745
22746 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
22747 static_assert(X86::AddrNumOperands == 5,
22748 "VAARG_64 assumes 5 address operands");
22749
22750 unsigned DestReg = MI.getOperand(0).getReg();
22751 MachineOperand &Base = MI.getOperand(1);
22752 MachineOperand &Scale = MI.getOperand(2);
22753 MachineOperand &Index = MI.getOperand(3);
22754 MachineOperand &Disp = MI.getOperand(4);
22755 MachineOperand &Segment = MI.getOperand(5);
22756 unsigned ArgSize = MI.getOperand(6).getImm();
22757 unsigned ArgMode = MI.getOperand(7).getImm();
22758 unsigned Align = MI.getOperand(8).getImm();
22759
22760 // Memory Reference
22761 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
22762 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
22763 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
22764
22765 // Machine Information
22766 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22767 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
22768 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
22769 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
22770 DebugLoc DL = MI.getDebugLoc();
22771
22772 // struct va_list {
22773 // i32 gp_offset
22774 // i32 fp_offset
22775 // i64 overflow_area (address)
22776 // i64 reg_save_area (address)
22777 // }
22778 // sizeof(va_list) = 24
22779 // alignment(va_list) = 8
22780
22781 unsigned TotalNumIntRegs = 6;
22782 unsigned TotalNumXMMRegs = 8;
22783 bool UseGPOffset = (ArgMode == 1);
22784 bool UseFPOffset = (ArgMode == 2);
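  // Per the SysV AMD64 ABI, the register save area holds 6 GPRs (8 bytes each)
  // followed by 8 XMM registers (16 bytes each), so gp_offset is bounded by 48
  // and fp_offset by 48 + 128 = 176.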
22785 unsigned MaxOffset = TotalNumIntRegs * 8 +
22786 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
22787
22788   // Align ArgSize to a multiple of 8.
22789 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
22790 bool NeedsAlign = (Align > 8);
22791
22792 MachineBasicBlock *thisMBB = MBB;
22793 MachineBasicBlock *overflowMBB;
22794 MachineBasicBlock *offsetMBB;
22795 MachineBasicBlock *endMBB;
22796
22797 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
22798 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
22799 unsigned OffsetReg = 0;
22800
22801 if (!UseGPOffset && !UseFPOffset) {
22802 // If we only pull from the overflow region, we don't create a branch.
22803 // We don't need to alter control flow.
22804 OffsetDestReg = 0; // unused
22805 OverflowDestReg = DestReg;
22806
22807 offsetMBB = nullptr;
22808 overflowMBB = thisMBB;
22809 endMBB = thisMBB;
22810 } else {
22811 // First emit code to check if gp_offset (or fp_offset) is below the bound.
22812 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
22813 // If not, pull from overflow_area. (branch to overflowMBB)
22814 //
22815 // thisMBB
22816 // | .
22817 // | .
22818 // offsetMBB overflowMBB
22819 // | .
22820 // | .
22821 // endMBB
22822
22823 // Registers for the PHI in endMBB
22824 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
22825 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
22826
22827 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
22828 MachineFunction *MF = MBB->getParent();
22829 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
22830 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
22831 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
22832
22833 MachineFunction::iterator MBBIter = ++MBB->getIterator();
22834
22835 // Insert the new basic blocks
22836 MF->insert(MBBIter, offsetMBB);
22837 MF->insert(MBBIter, overflowMBB);
22838 MF->insert(MBBIter, endMBB);
22839
22840 // Transfer the remainder of MBB and its successor edges to endMBB.
22841 endMBB->splice(endMBB->begin(), thisMBB,
22842 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
22843 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
22844
22845 // Make offsetMBB and overflowMBB successors of thisMBB
22846 thisMBB->addSuccessor(offsetMBB);
22847 thisMBB->addSuccessor(overflowMBB);
22848
22849 // endMBB is a successor of both offsetMBB and overflowMBB
22850 offsetMBB->addSuccessor(endMBB);
22851 overflowMBB->addSuccessor(endMBB);
22852
22853 // Load the offset value into a register
22854 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
22855 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
22856 .addOperand(Base)
22857 .addOperand(Scale)
22858 .addOperand(Index)
22859 .addDisp(Disp, UseFPOffset ? 4 : 0)
22860 .addOperand(Segment)
22861 .setMemRefs(MMOBegin, MMOEnd);
22862
22863 // Check if there is enough room left to pull this argument.
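  // The unsigned comparison below sends us to the overflow path when
  // OffsetReg + ArgSizeA8 > MaxOffset, i.e. when the argument would no longer
  // fit entirely inside the register save area.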
22864 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
22865 .addReg(OffsetReg)
22866 .addImm(MaxOffset + 8 - ArgSizeA8);
22867
22868 // Branch to "overflowMBB" if offset >= max
22869 // Fall through to "offsetMBB" otherwise
22870 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
22871 .addMBB(overflowMBB);
22872 }
22873
22874 // In offsetMBB, emit code to use the reg_save_area.
22875 if (offsetMBB) {
22876 assert(OffsetReg != 0);
22877
22878 // Read the reg_save_area address.
22879 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
22880 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
22881 .addOperand(Base)
22882 .addOperand(Scale)
22883 .addOperand(Index)
22884 .addDisp(Disp, 16)
22885 .addOperand(Segment)
22886 .setMemRefs(MMOBegin, MMOEnd);
22887
22888 // Zero-extend the offset
22889 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
22890 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
22891 .addImm(0)
22892 .addReg(OffsetReg)
22893 .addImm(X86::sub_32bit);
22894
22895 // Add the offset to the reg_save_area to get the final address.
22896 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
22897 .addReg(OffsetReg64)
22898 .addReg(RegSaveReg);
22899
22900 // Compute the offset for the next argument
22901 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
22902 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
22903 .addReg(OffsetReg)
22904 .addImm(UseFPOffset ? 16 : 8);
22905
22906 // Store it back into the va_list.
22907 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
22908 .addOperand(Base)
22909 .addOperand(Scale)
22910 .addOperand(Index)
22911 .addDisp(Disp, UseFPOffset ? 4 : 0)
22912 .addOperand(Segment)
22913 .addReg(NextOffsetReg)
22914 .setMemRefs(MMOBegin, MMOEnd);
22915
22916 // Jump to endMBB
22917 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
22918 .addMBB(endMBB);
22919 }
22920
22921 //
22922 // Emit code to use overflow area
22923 //
22924
22925 // Load the overflow_area address into a register.
22926 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
22927 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
22928 .addOperand(Base)
22929 .addOperand(Scale)
22930 .addOperand(Index)
22931 .addDisp(Disp, 8)
22932 .addOperand(Segment)
22933 .setMemRefs(MMOBegin, MMOEnd);
22934
22935 // If we need to align it, do so. Otherwise, just copy the address
22936 // to OverflowDestReg.
22937 if (NeedsAlign) {
22938 // Align the overflow address
22939 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
22940 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
22941
22942 // aligned_addr = (addr + (align-1)) & ~(align-1)
22943 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
22944 .addReg(OverflowAddrReg)
22945 .addImm(Align-1);
22946
22947 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
22948 .addReg(TmpReg)
22949 .addImm(~(uint64_t)(Align-1));
22950 } else {
22951 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
22952 .addReg(OverflowAddrReg);
22953 }
22954
22955 // Compute the next overflow address after this argument.
22956 // (the overflow address should be kept 8-byte aligned)
22957 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
22958 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
22959 .addReg(OverflowDestReg)
22960 .addImm(ArgSizeA8);
22961
22962 // Store the new overflow address.
22963 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
22964 .addOperand(Base)
22965 .addOperand(Scale)
22966 .addOperand(Index)
22967 .addDisp(Disp, 8)
22968 .addOperand(Segment)
22969 .addReg(NextAddrReg)
22970 .setMemRefs(MMOBegin, MMOEnd);
22971
22972 // If we branched, emit the PHI to the front of endMBB.
22973 if (offsetMBB) {
22974 BuildMI(*endMBB, endMBB->begin(), DL,
22975 TII->get(X86::PHI), DestReg)
22976 .addReg(OffsetDestReg).addMBB(offsetMBB)
22977 .addReg(OverflowDestReg).addMBB(overflowMBB);
22978 }
22979
22980 // Erase the pseudo instruction
22981 MI.eraseFromParent();
22982
22983 return endMBB;
22984 }
22985
22986 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
22987 MachineInstr &MI, MachineBasicBlock *MBB) const {
22988 // Emit code to save XMM registers to the stack. The ABI says that the
22989 // number of registers to save is given in %al, so it's theoretically
22990   // possible to do an indirect jump trick to avoid saving all of them;
22991   // however, this code takes a simpler approach and just executes all
22992 // of the stores if %al is non-zero. It's less code, and it's probably
22993 // easier on the hardware branch predictor, and stores aren't all that
22994 // expensive anyway.
22995
22996 // Create the new basic blocks. One block contains all the XMM stores,
22997 // and one block is the final destination regardless of whether any
22998 // stores were performed.
22999 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
23000 MachineFunction *F = MBB->getParent();
23001 MachineFunction::iterator MBBIter = ++MBB->getIterator();
23002 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
23003 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
23004 F->insert(MBBIter, XMMSaveMBB);
23005 F->insert(MBBIter, EndMBB);
23006
23007 // Transfer the remainder of MBB and its successor edges to EndMBB.
23008 EndMBB->splice(EndMBB->begin(), MBB,
23009 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
23010 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
23011
23012 // The original block will now fall through to the XMM save block.
23013 MBB->addSuccessor(XMMSaveMBB);
23014 // The XMMSaveMBB will fall through to the end block.
23015 XMMSaveMBB->addSuccessor(EndMBB);
23016
23017 // Now add the instructions.
23018 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23019 DebugLoc DL = MI.getDebugLoc();
23020
23021 unsigned CountReg = MI.getOperand(0).getReg();
23022 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
23023 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
23024
23025 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
23026 // If %al is 0, branch around the XMM save block.
23027 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
23028 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
23029 MBB->addSuccessor(EndMBB);
23030 }
23031
23032 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
23033 // that was just emitted, but clearly shouldn't be "saved".
23034 assert((MI.getNumOperands() <= 3 ||
23035 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
23036 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
23037 "Expected last argument to be EFLAGS");
23038 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
23039 // In the XMM save block, save all the XMM argument registers.
23040 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
23041 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
23042 MachineMemOperand *MMO = F->getMachineMemOperand(
23043 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
23044 MachineMemOperand::MOStore,
23045 /*Size=*/16, /*Align=*/16);
23046 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
23047 .addFrameIndex(RegSaveFrameIndex)
23048 .addImm(/*Scale=*/1)
23049 .addReg(/*IndexReg=*/0)
23050 .addImm(/*Disp=*/Offset)
23051 .addReg(/*Segment=*/0)
23052 .addReg(MI.getOperand(i).getReg())
23053 .addMemOperand(MMO);
23054 }
23055
23056 MI.eraseFromParent(); // The pseudo instruction is gone now.
23057
23058 return EndMBB;
23059 }
23060
23061 // The EFLAGS operand of SelectItr might be missing a kill marker
23062 // because there were multiple uses of EFLAGS, and ISel didn't know
23063 // which to mark. Figure out whether SelectItr should have had a
23064 // kill marker, and set it if it should. Returns the correct kill
23065 // marker value.
23066 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
23067 MachineBasicBlock* BB,
23068 const TargetRegisterInfo* TRI) {
23069 // Scan forward through BB for a use/def of EFLAGS.
23070 MachineBasicBlock::iterator miI(std::next(SelectItr));
23071 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
23072 const MachineInstr& mi = *miI;
23073 if (mi.readsRegister(X86::EFLAGS))
23074 return false;
23075 if (mi.definesRegister(X86::EFLAGS))
23076 break; // Should have kill-flag - update below.
23077 }
23078
23079 // If we hit the end of the block, check whether EFLAGS is live into a
23080 // successor.
23081 if (miI == BB->end()) {
23082 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
23083 sEnd = BB->succ_end();
23084 sItr != sEnd; ++sItr) {
23085 MachineBasicBlock* succ = *sItr;
23086 if (succ->isLiveIn(X86::EFLAGS))
23087 return false;
23088 }
23089 }
23090
23091 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
23092 // out. SelectMI should have a kill flag on EFLAGS.
23093 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
23094 return true;
23095 }
23096
23097 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
23098 // together with other CMOV pseudo-opcodes into a single basic-block with
23099 // conditional jump around it.
23100 static bool isCMOVPseudo(MachineInstr &MI) {
23101 switch (MI.getOpcode()) {
23102 case X86::CMOV_FR32:
23103 case X86::CMOV_FR64:
23104 case X86::CMOV_GR8:
23105 case X86::CMOV_GR16:
23106 case X86::CMOV_GR32:
23107 case X86::CMOV_RFP32:
23108 case X86::CMOV_RFP64:
23109 case X86::CMOV_RFP80:
23110 case X86::CMOV_V2F64:
23111 case X86::CMOV_V2I64:
23112 case X86::CMOV_V4F32:
23113 case X86::CMOV_V4F64:
23114 case X86::CMOV_V4I64:
23115 case X86::CMOV_V16F32:
23116 case X86::CMOV_V8F32:
23117 case X86::CMOV_V8F64:
23118 case X86::CMOV_V8I64:
23119 case X86::CMOV_V8I1:
23120 case X86::CMOV_V16I1:
23121 case X86::CMOV_V32I1:
23122 case X86::CMOV_V64I1:
23123 return true;
23124
23125 default:
23126 return false;
23127 }
23128 }
23129
23130 MachineBasicBlock *
23131 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
23132 MachineBasicBlock *BB) const {
23133 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23134 DebugLoc DL = MI.getDebugLoc();
23135
23136 // To "insert" a SELECT_CC instruction, we actually have to insert the
23137 // diamond control-flow pattern. The incoming instruction knows the
23138 // destination vreg to set, the condition code register to branch on, the
23139 // true/false values to select between, and a branch opcode to use.
23140 const BasicBlock *LLVM_BB = BB->getBasicBlock();
23141 MachineFunction::iterator It = ++BB->getIterator();
23142
23143 // thisMBB:
23144 // ...
23145 // TrueVal = ...
23146 // cmpTY ccX, r1, r2
23147 // bCC copy1MBB
23148 // fallthrough --> copy0MBB
23149 MachineBasicBlock *thisMBB = BB;
23150 MachineFunction *F = BB->getParent();
23151
23152 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
23153 // as described above, by inserting a BB, and then making a PHI at the join
23154 // point to select the true and false operands of the CMOV in the PHI.
23155 //
23156 // The code also handles two different cases of multiple CMOV opcodes
23157 // in a row.
23158 //
23159 // Case 1:
23160   // In this case, there are multiple CMOVs in a row, all of which are based on
23161 // the same condition setting (or the exact opposite condition setting).
23162 // In this case we can lower all the CMOVs using a single inserted BB, and
23163 // then make a number of PHIs at the join point to model the CMOVs. The only
23164   // trickiness here is that in a case like:
23165 //
23166 // t2 = CMOV cond1 t1, f1
23167 // t3 = CMOV cond1 t2, f2
23168 //
23169 // when rewriting this into PHIs, we have to perform some renaming on the
23170 // temps since you cannot have a PHI operand refer to a PHI result earlier
23171 // in the same block. The "simple" but wrong lowering would be:
23172 //
23173 // t2 = PHI t1(BB1), f1(BB2)
23174 // t3 = PHI t2(BB1), f2(BB2)
23175 //
23176 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
23177 // renaming is to note that on the path through BB1, t2 is really just a
23178 // copy of t1, and do that renaming, properly generating:
23179 //
23180 // t2 = PHI t1(BB1), f1(BB2)
23181 // t3 = PHI t1(BB1), f2(BB2)
23182 //
23183 // Case 2, we lower cascaded CMOVs such as
23184 //
23185 // (CMOV (CMOV F, T, cc1), T, cc2)
23186 //
23187   // to two successive branches. For that, we look for another CMOV as the
23188 // following instruction.
23189 //
23190 // Without this, we would add a PHI between the two jumps, which ends up
23191 // creating a few copies all around. For instance, for
23192 //
23193 // (sitofp (zext (fcmp une)))
23194 //
23195 // we would generate:
23196 //
23197 // ucomiss %xmm1, %xmm0
23198 // movss <1.0f>, %xmm0
23199 // movaps %xmm0, %xmm1
23200 // jne .LBB5_2
23201 // xorps %xmm1, %xmm1
23202 // .LBB5_2:
23203 // jp .LBB5_4
23204 // movaps %xmm1, %xmm0
23205 // .LBB5_4:
23206 // retq
23207 //
23208 // because this custom-inserter would have generated:
23209 //
23210 // A
23211 // | \
23212 // | B
23213 // | /
23214 // C
23215 // | \
23216 // | D
23217 // | /
23218 // E
23219 //
23220 // A: X = ...; Y = ...
23221 // B: empty
23222 // C: Z = PHI [X, A], [Y, B]
23223 // D: empty
23224 // E: PHI [X, C], [Z, D]
23225 //
23226 // If we lower both CMOVs in a single step, we can instead generate:
23227 //
23228 // A
23229 // | \
23230 // | C
23231 // | /|
23232 // |/ |
23233 // | |
23234 // | D
23235 // | /
23236 // E
23237 //
23238 // A: X = ...; Y = ...
23239 // D: empty
23240 // E: PHI [X, A], [X, C], [Y, D]
23241 //
23242 // Which, in our sitofp/fcmp example, gives us something like:
23243 //
23244 // ucomiss %xmm1, %xmm0
23245 // movss <1.0f>, %xmm0
23246 // jne .LBB5_4
23247 // jp .LBB5_4
23248 // xorps %xmm0, %xmm0
23249 // .LBB5_4:
23250 // retq
23251 //
23252 MachineInstr *CascadedCMOV = nullptr;
23253 MachineInstr *LastCMOV = &MI;
23254 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
23255 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
23256 MachineBasicBlock::iterator NextMIIt =
23257 std::next(MachineBasicBlock::iterator(MI));
23258
23259 // Check for case 1, where there are multiple CMOVs with the same condition
23260 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
23261 // number of jumps the most.
23262
23263 if (isCMOVPseudo(MI)) {
23264 // See if we have a string of CMOVS with the same condition.
23265 while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
23266 (NextMIIt->getOperand(3).getImm() == CC ||
23267 NextMIIt->getOperand(3).getImm() == OppCC)) {
23268 LastCMOV = &*NextMIIt;
23269 ++NextMIIt;
23270 }
23271 }
23272
23273   // Check for case 2, but only if we didn't already find case 1, as
23274   // indicated by LastCMOV still pointing at MI.
23275 if (LastCMOV == &MI && NextMIIt != BB->end() &&
23276 NextMIIt->getOpcode() == MI.getOpcode() &&
23277 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
23278 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
23279 NextMIIt->getOperand(1).isKill()) {
23280 CascadedCMOV = &*NextMIIt;
23281 }
23282
23283 MachineBasicBlock *jcc1MBB = nullptr;
23284
23285 // If we have a cascaded CMOV, we lower it to two successive branches to
23286 // the same block. EFLAGS is used by both, so mark it as live in the second.
23287 if (CascadedCMOV) {
23288 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
23289 F->insert(It, jcc1MBB);
23290 jcc1MBB->addLiveIn(X86::EFLAGS);
23291 }
23292
23293 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
23294 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
23295 F->insert(It, copy0MBB);
23296 F->insert(It, sinkMBB);
23297
23298 // If the EFLAGS register isn't dead in the terminator, then claim that it's
23299 // live into the sink and copy blocks.
23300 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
23301
23302 MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
23303 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
23304 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
23305 copy0MBB->addLiveIn(X86::EFLAGS);
23306 sinkMBB->addLiveIn(X86::EFLAGS);
23307 }
23308
23309 // Transfer the remainder of BB and its successor edges to sinkMBB.
23310 sinkMBB->splice(sinkMBB->begin(), BB,
23311 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
23312 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
23313
23314 // Add the true and fallthrough blocks as its successors.
23315 if (CascadedCMOV) {
23316 // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
23317 BB->addSuccessor(jcc1MBB);
23318
23319   // In that case, jcc1MBB will itself fall through to copy0MBB, and
23320 // jump to the sinkMBB.
23321 jcc1MBB->addSuccessor(copy0MBB);
23322 jcc1MBB->addSuccessor(sinkMBB);
23323 } else {
23324 BB->addSuccessor(copy0MBB);
23325 }
23326
23327 // The true block target of the first (or only) branch is always sinkMBB.
23328 BB->addSuccessor(sinkMBB);
23329
23330 // Create the conditional branch instruction.
23331 unsigned Opc = X86::GetCondBranchFromCond(CC);
23332 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
23333
23334 if (CascadedCMOV) {
23335 unsigned Opc2 = X86::GetCondBranchFromCond(
23336 (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
23337 BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
23338 }
23339
23340 // copy0MBB:
23341 // %FalseValue = ...
23342 // # fallthrough to sinkMBB
23343 copy0MBB->addSuccessor(sinkMBB);
23344
23345 // sinkMBB:
23346 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
23347 // ...
23348 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
23349 MachineBasicBlock::iterator MIItEnd =
23350 std::next(MachineBasicBlock::iterator(LastCMOV));
23351 MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
23352 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
23353 MachineInstrBuilder MIB;
23354
23355 // As we are creating the PHIs, we have to be careful if there is more than
23356 // one. Later CMOVs may reference the results of earlier CMOVs, but later
23357 // PHIs have to reference the individual true/false inputs from earlier PHIs.
23358 // That also means that PHI construction must work forward from earlier to
23359   // later, and that the code must maintain a mapping from earlier PHIs'
23360   // destination registers to the registers that went into the PHI.
23361
23362 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
23363 unsigned DestReg = MIIt->getOperand(0).getReg();
23364 unsigned Op1Reg = MIIt->getOperand(1).getReg();
23365 unsigned Op2Reg = MIIt->getOperand(2).getReg();
23366
23367 // If this CMOV we are generating is the opposite condition from
23368 // the jump we generated, then we have to swap the operands for the
23369 // PHI that is going to be generated.
23370 if (MIIt->getOperand(3).getImm() == OppCC)
23371 std::swap(Op1Reg, Op2Reg);
23372
23373 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
23374 Op1Reg = RegRewriteTable[Op1Reg].first;
23375
23376 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
23377 Op2Reg = RegRewriteTable[Op2Reg].second;
23378
23379 MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
23380 TII->get(X86::PHI), DestReg)
23381 .addReg(Op1Reg).addMBB(copy0MBB)
23382 .addReg(Op2Reg).addMBB(thisMBB);
23383
23384 // Add this PHI to the rewrite table.
23385 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
23386 }
23387
23388 // If we have a cascaded CMOV, the second Jcc provides the same incoming
23389 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
23390 if (CascadedCMOV) {
23391 MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
23392 // Copy the PHI result to the register defined by the second CMOV.
23393 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
23394 DL, TII->get(TargetOpcode::COPY),
23395 CascadedCMOV->getOperand(0).getReg())
23396 .addReg(MI.getOperand(0).getReg());
23397 CascadedCMOV->eraseFromParent();
23398 }
23399
23400 // Now remove the CMOV(s).
23401 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
23402 (MIIt++)->eraseFromParent();
23403
23404 return sinkMBB;
23405 }
23406
23407 MachineBasicBlock *
23408 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
23409 MachineBasicBlock *BB) const {
23410 // Combine the following atomic floating-point modification pattern:
23411 // a.store(reg OP a.load(acquire), release)
23412   // Transform it into:
23413 // OPss (%gpr), %xmm
23414 // movss %xmm, (%gpr)
23415 // Or sd equivalent for 64-bit operations.
23416 unsigned MOp, FOp;
23417 switch (MI.getOpcode()) {
23418 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
23419 case X86::RELEASE_FADD32mr:
23420 FOp = X86::ADDSSrm;
23421 MOp = X86::MOVSSmr;
23422 break;
23423 case X86::RELEASE_FADD64mr:
23424 FOp = X86::ADDSDrm;
23425 MOp = X86::MOVSDmr;
23426 break;
23427 }
23428 const X86InstrInfo *TII = Subtarget.getInstrInfo();
23429 DebugLoc DL = MI.getDebugLoc();
23430 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
23431 unsigned ValOpIdx = X86::AddrNumOperands;
23432 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
23433 MachineInstrBuilder MIB =
23434 BuildMI(*BB, MI, DL, TII->get(FOp),
23435 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
23436 .addReg(VSrc);
23437 for (int i = 0; i < X86::AddrNumOperands; ++i) {
23438 MachineOperand &Operand = MI.getOperand(i);
23439 // Clear any kill flags on register operands as we'll create a second
23440 // instruction using the same address operands.
23441 if (Operand.isReg())
23442 Operand.setIsKill(false);
23443 MIB.addOperand(Operand);
23444 }
23445 MachineInstr *FOpMI = MIB;
23446 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
23447 for (int i = 0; i < X86::AddrNumOperands; ++i)
23448 MIB.addOperand(MI.getOperand(i));
23449 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
23450 MI.eraseFromParent(); // The pseudo instruction is gone now.
23451 return BB;
23452 }
23453
23454 MachineBasicBlock *
23455 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
23456 MachineBasicBlock *BB) const {
23457 MachineFunction *MF = BB->getParent();
23458 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23459 DebugLoc DL = MI.getDebugLoc();
23460 const BasicBlock *LLVM_BB = BB->getBasicBlock();
23461
23462 assert(MF->shouldSplitStack());
23463
23464 const bool Is64Bit = Subtarget.is64Bit();
23465 const bool IsLP64 = Subtarget.isTarget64BitLP64();
23466
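  // For segmented stacks the current stack limit lives in a fixed TLS slot:
  // %fs:0x70 for LP64, %fs:0x40 for x32, and %gs:0x30 for 32-bit targets.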
23467 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
23468 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
23469
23470 // BB:
23471 // ... [Till the alloca]
23472 // If stacklet is not large enough, jump to mallocMBB
23473 //
23474 // bumpMBB:
23475 // Allocate by subtracting from RSP
23476 // Jump to continueMBB
23477 //
23478 // mallocMBB:
23479 // Allocate by call to runtime
23480 //
23481 // continueMBB:
23482 // ...
23483 // [rest of original BB]
23484 //
23485
23486 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
23487 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
23488 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
23489
23490 MachineRegisterInfo &MRI = MF->getRegInfo();
23491 const TargetRegisterClass *AddrRegClass =
23492 getRegClassFor(getPointerTy(MF->getDataLayout()));
23493
23494 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
23495 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
23496 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
23497 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
23498 sizeVReg = MI.getOperand(1).getReg(),
23499 physSPReg =
23500 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
23501
23502 MachineFunction::iterator MBBIter = ++BB->getIterator();
23503
23504 MF->insert(MBBIter, bumpMBB);
23505 MF->insert(MBBIter, mallocMBB);
23506 MF->insert(MBBIter, continueMBB);
23507
23508 continueMBB->splice(continueMBB->begin(), BB,
23509 std::next(MachineBasicBlock::iterator(MI)), BB->end());
23510 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
23511
23512 // Add code to the main basic block to check if the stack limit has been hit,
23513 // and if so, jump to mallocMBB otherwise to bumpMBB.
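  // SPLimitVReg is the prospective new stack pointer (current SP minus the
  // allocation size); if the TLS stack limit is greater than that, the current
  // stacklet is too small and we must allocate through the runtime instead.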
23514 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
23515 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
23516 .addReg(tmpSPVReg).addReg(sizeVReg);
23517 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
23518 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
23519 .addReg(SPLimitVReg);
23520 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
23521
23522 // bumpMBB simply decreases the stack pointer, since we know the current
23523 // stacklet has enough space.
23524 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
23525 .addReg(SPLimitVReg);
23526 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
23527 .addReg(SPLimitVReg);
23528 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
23529
23530 // Calls into a routine in libgcc to allocate more space from the heap.
23531 const uint32_t *RegMask =
23532 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
23533 if (IsLP64) {
23534 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
23535 .addReg(sizeVReg);
23536 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
23537 .addExternalSymbol("__morestack_allocate_stack_space")
23538 .addRegMask(RegMask)
23539 .addReg(X86::RDI, RegState::Implicit)
23540 .addReg(X86::RAX, RegState::ImplicitDefine);
23541 } else if (Is64Bit) {
23542 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
23543 .addReg(sizeVReg);
23544 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
23545 .addExternalSymbol("__morestack_allocate_stack_space")
23546 .addRegMask(RegMask)
23547 .addReg(X86::EDI, RegState::Implicit)
23548 .addReg(X86::EAX, RegState::ImplicitDefine);
23549 } else {
23550 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
23551 .addImm(12);
23552 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
23553 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
23554 .addExternalSymbol("__morestack_allocate_stack_space")
23555 .addRegMask(RegMask)
23556 .addReg(X86::EAX, RegState::ImplicitDefine);
23557 }
23558
23559 if (!Is64Bit)
23560 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
23561 .addImm(16);
23562
23563 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
23564 .addReg(IsLP64 ? X86::RAX : X86::EAX);
23565 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
23566
23567 // Set up the CFG correctly.
23568 BB->addSuccessor(bumpMBB);
23569 BB->addSuccessor(mallocMBB);
23570 mallocMBB->addSuccessor(continueMBB);
23571 bumpMBB->addSuccessor(continueMBB);
23572
23573 // Take care of the PHI nodes.
23574 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
23575 MI.getOperand(0).getReg())
23576 .addReg(mallocPtrVReg)
23577 .addMBB(mallocMBB)
23578 .addReg(bumpSPPtrVReg)
23579 .addMBB(bumpMBB);
23580
23581 // Delete the original pseudo instruction.
23582 MI.eraseFromParent();
23583
23584 // And we're done.
23585 return continueMBB;
23586 }
23587
23588 MachineBasicBlock *
23589 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
23590 MachineBasicBlock *BB) const {
23591 MachineFunction *MF = BB->getParent();
23592 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
23593 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
23594 DebugLoc DL = MI.getDebugLoc();
23595
23596 assert(!isAsynchronousEHPersonality(
23597 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
23598 "SEH does not use catchret!");
23599
23600 // Only 32-bit EH needs to worry about manually restoring stack pointers.
23601 if (!Subtarget.is32Bit())
23602 return BB;
23603
23604 // C++ EH creates a new target block to hold the restore code, and wires up
23605 // the new block to the return destination with a normal JMP_4.
23606 MachineBasicBlock *RestoreMBB =
23607 MF->CreateMachineBasicBlock(BB->getBasicBlock());
23608 assert(BB->succ_size() == 1);
23609 MF->insert(std::next(BB->getIterator()), RestoreMBB);
23610 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
23611 BB->addSuccessor(RestoreMBB);
23612 MI.getOperand(0).setMBB(RestoreMBB);
23613
23614 auto RestoreMBBI = RestoreMBB->begin();
23615 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
23616 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
23617 return BB;
23618 }
23619
23620 MachineBasicBlock *
23621 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
23622 MachineBasicBlock *BB) const {
23623 MachineFunction *MF = BB->getParent();
23624 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
23625 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
23626 // Only 32-bit SEH requires special handling for catchpad.
23627 if (IsSEH && Subtarget.is32Bit()) {
23628 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
23629 DebugLoc DL = MI.getDebugLoc();
23630 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
23631 }
23632 MI.eraseFromParent();
23633 return BB;
23634 }
23635
23636 MachineBasicBlock *
23637 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
23638 MachineBasicBlock *BB) const {
23639   // Here we replace TLSADDR with the sequence:
23640   // adjust_stackdown -> TLSADDR -> adjust_stackup.
23641   // We need this because TLSADDR is lowered into a call
23642   // inside MC; without the two markers, shrink-wrapping
23643   // may push the prologue/epilogue past them.
23644 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
23645 DebugLoc DL = MI.getDebugLoc();
23646 MachineFunction &MF = *BB->getParent();
23647
23648 // Emit CALLSEQ_START right before the instruction.
23649 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
23650 MachineInstrBuilder CallseqStart =
23651 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
23652 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
23653
23654 // Emit CALLSEQ_END right after the instruction.
23655 // We don't call erase from parent because we want to keep the
23656 // original instruction around.
23657 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
23658 MachineInstrBuilder CallseqEnd =
23659 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
23660 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
23661
23662 return BB;
23663 }
23664
23665 MachineBasicBlock *
23666 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
23667 MachineBasicBlock *BB) const {
23668 // This is pretty easy. We're taking the value that we received from
23669 // our load from the relocation, sticking it in either RDI (x86-64)
23670 // or EAX and doing an indirect call. The return value will then
23671 // be in the normal return register.
23672 MachineFunction *F = BB->getParent();
23673 const X86InstrInfo *TII = Subtarget.getInstrInfo();
23674 DebugLoc DL = MI.getDebugLoc();
23675
23676 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
23677 assert(MI.getOperand(3).isGlobal() && "This should be a global");
23678
23679 // Get a register mask for the lowered call.
23680 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
23681 // proper register mask.
23682 const uint32_t *RegMask =
23683 Subtarget.is64Bit() ?
23684 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
23685 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
23686 if (Subtarget.is64Bit()) {
23687 MachineInstrBuilder MIB =
23688 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
23689 .addReg(X86::RIP)
23690 .addImm(0)
23691 .addReg(0)
23692 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
23693 MI.getOperand(3).getTargetFlags())
23694 .addReg(0);
23695 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
23696 addDirectMem(MIB, X86::RDI);
23697 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
23698 } else if (!isPositionIndependent()) {
23699 MachineInstrBuilder MIB =
23700 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
23701 .addReg(0)
23702 .addImm(0)
23703 .addReg(0)
23704 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
23705 MI.getOperand(3).getTargetFlags())
23706 .addReg(0);
23707 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
23708 addDirectMem(MIB, X86::EAX);
23709 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
23710 } else {
23711 MachineInstrBuilder MIB =
23712 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
23713 .addReg(TII->getGlobalBaseReg(F))
23714 .addImm(0)
23715 .addReg(0)
23716 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
23717 MI.getOperand(3).getTargetFlags())
23718 .addReg(0);
23719 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
23720 addDirectMem(MIB, X86::EAX);
23721 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
23722 }
23723
23724 MI.eraseFromParent(); // The pseudo instruction is gone now.
23725 return BB;
23726 }
23727
23728 MachineBasicBlock *
23729 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
23730 MachineBasicBlock *MBB) const {
23731 DebugLoc DL = MI.getDebugLoc();
23732 MachineFunction *MF = MBB->getParent();
23733 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23734 MachineRegisterInfo &MRI = MF->getRegInfo();
23735
23736 const BasicBlock *BB = MBB->getBasicBlock();
23737 MachineFunction::iterator I = ++MBB->getIterator();
23738
23739 // Memory Reference
23740 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
23741 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
23742
23743 unsigned DstReg;
23744 unsigned MemOpndSlot = 0;
23745
23746 unsigned CurOp = 0;
23747
23748 DstReg = MI.getOperand(CurOp++).getReg();
23749 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
23750 assert(RC->hasType(MVT::i32) && "Invalid destination!");
23751 unsigned mainDstReg = MRI.createVirtualRegister(RC);
23752 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
23753
23754 MemOpndSlot = CurOp;
23755
23756 MVT PVT = getPointerTy(MF->getDataLayout());
23757 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
23758 "Invalid Pointer Size!");
23759
23760 // For v = setjmp(buf), we generate
23761 //
23762 // thisMBB:
23763 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
23764 // SjLjSetup restoreMBB
23765 //
23766 // mainMBB:
23767 // v_main = 0
23768 //
23769 // sinkMBB:
23770 // v = phi(main, restore)
23771 //
23772 // restoreMBB:
23773 // if base pointer being used, load it from frame
23774 // v_restore = 1
23775
23776 MachineBasicBlock *thisMBB = MBB;
23777 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
23778 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
23779 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
23780 MF->insert(I, mainMBB);
23781 MF->insert(I, sinkMBB);
23782 MF->push_back(restoreMBB);
23783 restoreMBB->setHasAddressTaken();
23784
23785 MachineInstrBuilder MIB;
23786
23787 // Transfer the remainder of BB and its successor edges to sinkMBB.
23788 sinkMBB->splice(sinkMBB->begin(), MBB,
23789 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
23790 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
23791
23792 // thisMBB:
23793 unsigned PtrStoreOpc = 0;
23794 unsigned LabelReg = 0;
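  // The buffer uses the GCC __builtin_setjmp layout: slot 0 (FP) and slot 2
  // (SP) are expected to be filled in by the intrinsic's caller, while slot 1
  // receives the resume address (restoreMBB) here.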
23795 const int64_t LabelOffset = 1 * PVT.getStoreSize();
23796 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
23797 !isPositionIndependent();
23798
23799 // Prepare IP either in reg or imm.
23800 if (!UseImmLabel) {
23801 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
23802 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
23803 LabelReg = MRI.createVirtualRegister(PtrRC);
23804 if (Subtarget.is64Bit()) {
23805 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
23806 .addReg(X86::RIP)
23807 .addImm(0)
23808 .addReg(0)
23809 .addMBB(restoreMBB)
23810 .addReg(0);
23811 } else {
23812 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
23813 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
23814 .addReg(XII->getGlobalBaseReg(MF))
23815 .addImm(0)
23816 .addReg(0)
23817 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
23818 .addReg(0);
23819 }
23820 } else
23821 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
23822 // Store IP
23823 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
23824 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
23825 if (i == X86::AddrDisp)
23826 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
23827 else
23828 MIB.addOperand(MI.getOperand(MemOpndSlot + i));
23829 }
23830 if (!UseImmLabel)
23831 MIB.addReg(LabelReg);
23832 else
23833 MIB.addMBB(restoreMBB);
23834 MIB.setMemRefs(MMOBegin, MMOEnd);
23835 // Setup
23836 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
23837 .addMBB(restoreMBB);
23838
23839 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
23840 MIB.addRegMask(RegInfo->getNoPreservedMask());
23841 thisMBB->addSuccessor(mainMBB);
23842 thisMBB->addSuccessor(restoreMBB);
23843
23844 // mainMBB:
23845 // EAX = 0
23846 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
23847 mainMBB->addSuccessor(sinkMBB);
23848
23849 // sinkMBB:
23850 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
23851 TII->get(X86::PHI), DstReg)
23852 .addReg(mainDstReg).addMBB(mainMBB)
23853 .addReg(restoreDstReg).addMBB(restoreMBB);
23854
23855 // restoreMBB:
23856 if (RegInfo->hasBasePointer(*MF)) {
23857 const bool Uses64BitFramePtr =
23858 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
23859 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
23860 X86FI->setRestoreBasePointer(MF);
23861 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
23862 unsigned BasePtr = RegInfo->getBaseRegister();
23863 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
23864 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
23865 FramePtr, true, X86FI->getRestoreBasePointerOffset())
23866 .setMIFlag(MachineInstr::FrameSetup);
23867 }
23868 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
23869 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
23870 restoreMBB->addSuccessor(sinkMBB);
23871
23872 MI.eraseFromParent();
23873 return sinkMBB;
23874 }
23875
23876 MachineBasicBlock *
23877 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
23878 MachineBasicBlock *MBB) const {
23879 DebugLoc DL = MI.getDebugLoc();
23880 MachineFunction *MF = MBB->getParent();
23881 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23882 MachineRegisterInfo &MRI = MF->getRegInfo();
23883
23884 // Memory Reference
23885 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
23886 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
23887
23888 MVT PVT = getPointerTy(MF->getDataLayout());
23889 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
23890 "Invalid Pointer Size!");
23891
23892 const TargetRegisterClass *RC =
23893 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
23894 unsigned Tmp = MRI.createVirtualRegister(RC);
23895 // Since FP is only updated here but NOT referenced, it's treated as GPR.
23896 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
23897 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
23898 unsigned SP = RegInfo->getStackRegister();
23899
23900 MachineInstrBuilder MIB;
23901
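  // The buffer follows the GCC __builtin_setjmp layout: slot 0 holds the frame
  // pointer, slot 1 the resume address, and slot 2 the stack pointer.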
23902 const int64_t LabelOffset = 1 * PVT.getStoreSize();
23903 const int64_t SPOffset = 2 * PVT.getStoreSize();
23904
23905 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
23906 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
23907
23908 // Reload FP
23909 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
23910 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
23911 MIB.addOperand(MI.getOperand(i));
23912 MIB.setMemRefs(MMOBegin, MMOEnd);
23913 // Reload IP
23914 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
23915 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
23916 if (i == X86::AddrDisp)
23917 MIB.addDisp(MI.getOperand(i), LabelOffset);
23918 else
23919 MIB.addOperand(MI.getOperand(i));
23920 }
23921 MIB.setMemRefs(MMOBegin, MMOEnd);
23922 // Reload SP
23923 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
23924 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
23925 if (i == X86::AddrDisp)
23926 MIB.addDisp(MI.getOperand(i), SPOffset);
23927 else
23928 MIB.addOperand(MI.getOperand(i));
23929 }
23930 MIB.setMemRefs(MMOBegin, MMOEnd);
23931 // Jump
23932 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
23933
23934 MI.eraseFromParent();
23935 return MBB;
23936 }
23937
23938 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
23939 MachineBasicBlock *MBB,
23940 MachineBasicBlock *DispatchBB,
23941 int FI) const {
23942 DebugLoc DL = MI.getDebugLoc();
23943 MachineFunction *MF = MBB->getParent();
23944 MachineRegisterInfo *MRI = &MF->getRegInfo();
23945 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23946
23947 MVT PVT = getPointerTy(MF->getDataLayout());
23948 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
23949
23950 unsigned Op = 0;
23951 unsigned VR = 0;
23952
23953 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
23954 !isPositionIndependent();
23955
23956 if (UseImmLabel) {
23957 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
23958 } else {
23959 const TargetRegisterClass *TRC =
23960 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
23961 VR = MRI->createVirtualRegister(TRC);
23962 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
23963
23964 /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
23965
23966 if (Subtarget.is64Bit())
23967 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
23968 .addReg(X86::RIP)
23969 .addImm(1)
23970 .addReg(0)
23971 .addMBB(DispatchBB)
23972 .addReg(0);
23973 else
23974 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
23975 .addReg(0) /* XII->getGlobalBaseReg(MF) */
23976 .addImm(1)
23977 .addReg(0)
23978 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
23979 .addReg(0);
23980 }
23981
23982 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
23983 addFrameReference(MIB, FI, 36);
23984 if (UseImmLabel)
23985 MIB.addMBB(DispatchBB);
23986 else
23987 MIB.addReg(VR);
23988 }
23989
23990 MachineBasicBlock *
23991 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
23992 MachineBasicBlock *BB) const {
23993 DebugLoc DL = MI.getDebugLoc();
23994 MachineFunction *MF = BB->getParent();
23995 MachineModuleInfo *MMI = &MF->getMMI();
23996 MachineFrameInfo *MFI = MF->getFrameInfo();
23997 MachineRegisterInfo *MRI = &MF->getRegInfo();
23998 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23999 int FI = MFI->getFunctionContextIndex();
24000
24001 // Get a mapping of the call site numbers to all of the landing pads they're
24002 // associated with.
24003 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
24004 unsigned MaxCSNum = 0;
24005 for (auto &MBB : *MF) {
24006 if (!MBB.isEHPad())
24007 continue;
24008
24009 MCSymbol *Sym = nullptr;
24010 for (const auto &MI : MBB) {
24011 if (MI.isDebugValue())
24012 continue;
24013
24014 assert(MI.isEHLabel() && "expected EH_LABEL");
24015 Sym = MI.getOperand(0).getMCSymbol();
24016 break;
24017 }
24018
24019 if (!MMI->hasCallSiteLandingPad(Sym))
24020 continue;
24021
24022 for (unsigned CSI : MMI->getCallSiteLandingPad(Sym)) {
24023 CallSiteNumToLPad[CSI].push_back(&MBB);
24024 MaxCSNum = std::max(MaxCSNum, CSI);
24025 }
24026 }
24027
24028 // Get an ordered list of the machine basic blocks for the jump table.
24029 std::vector<MachineBasicBlock *> LPadList;
24030 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
24031 LPadList.reserve(CallSiteNumToLPad.size());
24032
24033 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
24034 for (auto &LP : CallSiteNumToLPad[CSI]) {
24035 LPadList.push_back(LP);
24036 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
24037 }
24038 }
24039
24040 assert(!LPadList.empty() &&
24041 "No landing pad destinations for the dispatch jump table!");
24042
24043 // Create the MBBs for the dispatch code.
24044
24045 // Shove the dispatch's address into the return slot in the function context.
24046 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
24047 DispatchBB->setIsEHPad(true);
24048
24049 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
24050 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
24051 DispatchBB->addSuccessor(TrapBB);
24052
24053 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
24054 DispatchBB->addSuccessor(DispContBB);
24055
24056 // Insert MBBs.
24057 MF->push_back(DispatchBB);
24058 MF->push_back(DispContBB);
24059 MF->push_back(TrapBB);
24060
24061 // Insert code into the entry block that creates and registers the function
24062 // context.
24063 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
24064
24065 // Create the jump table and associated information
24066 MachineJumpTableInfo *JTI =
24067 MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
24068 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
24069
24070 const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
24071 const X86RegisterInfo &RI = XII->getRegisterInfo();
24072
24073 // Add a register mask with no preserved registers. This results in all
24074 // registers being marked as clobbered.
24075 if (RI.hasBasePointer(*MF)) {
24076 const bool FPIs64Bit =
24077 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
24078 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
24079 MFI->setRestoreBasePointer(MF);
24080
24081 unsigned FP = RI.getFrameRegister(*MF);
24082 unsigned BP = RI.getBaseRegister();
24083 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
24084 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
24085 MFI->getRestoreBasePointerOffset())
24086 .addRegMask(RI.getNoPreservedMask());
24087 } else {
24088 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
24089 .addRegMask(RI.getNoPreservedMask());
24090 }
24091
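  // Load the call-site index that the SjLj runtime recorded in the function
  // context and range-check it; anything above the number of landing pads
  // means the context is corrupt, so trap.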
24092 unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
24093 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
24094 4);
24095 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
24096 .addReg(IReg)
24097 .addImm(LPadList.size());
24098 BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
24099
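  // Call-site indices are 1-based, so subtract one to form the jump-table
  // index, then jump indirectly through the pointer-sized table entry.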
24100 unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
24101 BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
24102 .addReg(IReg)
24103 .addImm(1);
24104 BuildMI(DispContBB, DL,
24105 TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
24106 .addReg(0)
24107 .addImm(Subtarget.is64Bit() ? 8 : 4)
24108 .addReg(JReg)
24109 .addJumpTableIndex(MJTI)
24110 .addReg(0);
24111
24112 // Add the jump table entries as successors to the MBB.
24113 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
24114 for (auto &LP : LPadList)
24115 if (SeenMBBs.insert(LP).second)
24116 DispContBB->addSuccessor(LP);
24117
24118 // N.B. the order the invoke BBs are processed in doesn't matter here.
24119 SmallVector<MachineBasicBlock *, 64> MBBLPads;
24120 const MCPhysReg *SavedRegs =
24121 Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
24122 for (MachineBasicBlock *MBB : InvokeBBs) {
24123 // Remove the landing pad successor from the invoke block and replace it
24124 // with the new dispatch block.
24125 // Keep a copy of Successors since it's modified inside the loop.
24126 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
24127 MBB->succ_rend());
24128 // FIXME: Avoid quadratic complexity.
24129 for (auto MBBS : Successors) {
24130 if (MBBS->isEHPad()) {
24131 MBB->removeSuccessor(MBBS);
24132 MBBLPads.push_back(MBBS);
24133 }
24134 }
24135
24136 MBB->addSuccessor(DispatchBB);
24137
24138 // Find the invoke call and mark all of the callee-saved registers as
24139   // 'implicit defined' so that they're spilled. This prevents later passes
24140   // from moving instructions to before the EH block, where they will never be
24141 // executed.
24142 for (auto &II : reverse(*MBB)) {
24143 if (!II.isCall())
24144 continue;
24145
24146 DenseMap<unsigned, bool> DefRegs;
24147 for (auto &MOp : II.operands())
24148 if (MOp.isReg())
24149 DefRegs[MOp.getReg()] = true;
24150
24151 MachineInstrBuilder MIB(*MF, &II);
24152 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
24153 unsigned Reg = SavedRegs[RI];
24154 if (!DefRegs[Reg])
24155 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
24156 }
24157
24158 break;
24159 }
24160 }
24161
24162 // Mark all former landing pads as non-landing pads. The dispatch is the only
24163 // landing pad now.
24164 for (auto &LP : MBBLPads)
24165 LP->setIsEHPad(false);
24166
24167 // The instruction is gone now.
24168 MI.eraseFromParent();
24169 return BB;
24170 }
24171
24172 // Replace 213-type (isel default) FMA3 instructions with 231-type for
24173 // accumulator loops. Writing back to the accumulator allows the coalescer
24174 // to remove extra copies in the loop.
24175 // FIXME: Do this on AVX512. We don't support 231 variants yet (PR23937).
24176 MachineBasicBlock *
24177 X86TargetLowering::emitFMA3Instr(MachineInstr &MI,
24178 MachineBasicBlock *MBB) const {
24179 MachineOperand &AddendOp = MI.getOperand(3);
24180
24181 // Bail out early if the addend isn't a register - we can't switch these.
24182 if (!AddendOp.isReg())
24183 return MBB;
24184
24185 MachineFunction &MF = *MBB->getParent();
24186 MachineRegisterInfo &MRI = MF.getRegInfo();
24187
24188 // Check whether the addend is defined by a PHI:
24189 assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
24190 MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
24191 if (!AddendDef.isPHI())
24192 return MBB;
24193
24194 // Look for the following pattern:
24195 // loop:
24196 // %addend = phi [%entry, 0], [%loop, %result]
24197 // ...
24198 // %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
24199
24200 // Replace with:
24201 // loop:
24202 // %addend = phi [%entry, 0], [%loop, %result]
24203 // ...
24204 // %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
24205
24206 for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
24207 assert(AddendDef.getOperand(i).isReg());
24208 MachineOperand PHISrcOp = AddendDef.getOperand(i);
24209 MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
24210 if (&PHISrcInst == &MI) {
24211 // Found a matching instruction.
24212 unsigned NewFMAOpc = 0;
24213 switch (MI.getOpcode()) {
24214 case X86::VFMADDPDr213r:
24215 NewFMAOpc = X86::VFMADDPDr231r;
24216 break;
24217 case X86::VFMADDPSr213r:
24218 NewFMAOpc = X86::VFMADDPSr231r;
24219 break;
24220 case X86::VFMADDSDr213r:
24221 NewFMAOpc = X86::VFMADDSDr231r;
24222 break;
24223 case X86::VFMADDSSr213r:
24224 NewFMAOpc = X86::VFMADDSSr231r;
24225 break;
24226 case X86::VFMSUBPDr213r:
24227 NewFMAOpc = X86::VFMSUBPDr231r;
24228 break;
24229 case X86::VFMSUBPSr213r:
24230 NewFMAOpc = X86::VFMSUBPSr231r;
24231 break;
24232 case X86::VFMSUBSDr213r:
24233 NewFMAOpc = X86::VFMSUBSDr231r;
24234 break;
24235 case X86::VFMSUBSSr213r:
24236 NewFMAOpc = X86::VFMSUBSSr231r;
24237 break;
24238 case X86::VFNMADDPDr213r:
24239 NewFMAOpc = X86::VFNMADDPDr231r;
24240 break;
24241 case X86::VFNMADDPSr213r:
24242 NewFMAOpc = X86::VFNMADDPSr231r;
24243 break;
24244 case X86::VFNMADDSDr213r:
24245 NewFMAOpc = X86::VFNMADDSDr231r;
24246 break;
24247 case X86::VFNMADDSSr213r:
24248 NewFMAOpc = X86::VFNMADDSSr231r;
24249 break;
24250 case X86::VFNMSUBPDr213r:
24251 NewFMAOpc = X86::VFNMSUBPDr231r;
24252 break;
24253 case X86::VFNMSUBPSr213r:
24254 NewFMAOpc = X86::VFNMSUBPSr231r;
24255 break;
24256 case X86::VFNMSUBSDr213r:
24257 NewFMAOpc = X86::VFNMSUBSDr231r;
24258 break;
24259 case X86::VFNMSUBSSr213r:
24260 NewFMAOpc = X86::VFNMSUBSSr231r;
24261 break;
24262 case X86::VFMADDSUBPDr213r:
24263 NewFMAOpc = X86::VFMADDSUBPDr231r;
24264 break;
24265 case X86::VFMADDSUBPSr213r:
24266 NewFMAOpc = X86::VFMADDSUBPSr231r;
24267 break;
24268 case X86::VFMSUBADDPDr213r:
24269 NewFMAOpc = X86::VFMSUBADDPDr231r;
24270 break;
24271 case X86::VFMSUBADDPSr213r:
24272 NewFMAOpc = X86::VFMSUBADDPSr231r;
24273 break;
24274
24275 case X86::VFMADDPDr213rY:
24276 NewFMAOpc = X86::VFMADDPDr231rY;
24277 break;
24278 case X86::VFMADDPSr213rY:
24279 NewFMAOpc = X86::VFMADDPSr231rY;
24280 break;
24281 case X86::VFMSUBPDr213rY:
24282 NewFMAOpc = X86::VFMSUBPDr231rY;
24283 break;
24284 case X86::VFMSUBPSr213rY:
24285 NewFMAOpc = X86::VFMSUBPSr231rY;
24286 break;
24287 case X86::VFNMADDPDr213rY:
24288 NewFMAOpc = X86::VFNMADDPDr231rY;
24289 break;
24290 case X86::VFNMADDPSr213rY:
24291 NewFMAOpc = X86::VFNMADDPSr231rY;
24292 break;
24293 case X86::VFNMSUBPDr213rY:
24294 NewFMAOpc = X86::VFNMSUBPDr231rY;
24295 break;
24296 case X86::VFNMSUBPSr213rY:
24297 NewFMAOpc = X86::VFNMSUBPSr231rY;
24298 break;
24299 case X86::VFMADDSUBPDr213rY:
24300 NewFMAOpc = X86::VFMADDSUBPDr231rY;
24301 break;
24302 case X86::VFMADDSUBPSr213rY:
24303 NewFMAOpc = X86::VFMADDSUBPSr231rY;
24304 break;
24305 case X86::VFMSUBADDPDr213rY:
24306 NewFMAOpc = X86::VFMSUBADDPDr231rY;
24307 break;
24308 case X86::VFMSUBADDPSr213rY:
24309 NewFMAOpc = X86::VFMSUBADDPSr231rY;
24310 break;
24311 default:
24312 llvm_unreachable("Unrecognized FMA variant.");
24313 }
24314
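      // Rebuild the FMA in 231 form: the destination stays operand 0, the
      // addend (old operand 3) becomes the tied first source, and the
      // multiplicands move into the remaining two slots.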
24315 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
24316 MachineInstrBuilder MIB =
24317 BuildMI(MF, MI.getDebugLoc(), TII.get(NewFMAOpc))
24318 .addOperand(MI.getOperand(0))
24319 .addOperand(MI.getOperand(3))
24320 .addOperand(MI.getOperand(2))
24321 .addOperand(MI.getOperand(1));
24322 MBB->insert(MachineBasicBlock::iterator(MI), MIB);
24323 MI.eraseFromParent();
24324 }
24325 }
24326
24327 return MBB;
24328 }
24329
24330 MachineBasicBlock *
24331 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
24332 MachineBasicBlock *BB) const {
24333 switch (MI.getOpcode()) {
24334 default: llvm_unreachable("Unexpected instr type to insert");
24335 case X86::TAILJMPd64:
24336 case X86::TAILJMPr64:
24337 case X86::TAILJMPm64:
24338 case X86::TAILJMPd64_REX:
24339 case X86::TAILJMPr64_REX:
24340 case X86::TAILJMPm64_REX:
24341 llvm_unreachable("TAILJMP64 would not be touched here.");
24342 case X86::TCRETURNdi64:
24343 case X86::TCRETURNri64:
24344 case X86::TCRETURNmi64:
24345 return BB;
24346 case X86::TLS_addr32:
24347 case X86::TLS_addr64:
24348 case X86::TLS_base_addr32:
24349 case X86::TLS_base_addr64:
24350 return EmitLoweredTLSAddr(MI, BB);
24351 case X86::CATCHRET:
24352 return EmitLoweredCatchRet(MI, BB);
24353 case X86::CATCHPAD:
24354 return EmitLoweredCatchPad(MI, BB);
24355 case X86::SEG_ALLOCA_32:
24356 case X86::SEG_ALLOCA_64:
24357 return EmitLoweredSegAlloca(MI, BB);
24358 case X86::TLSCall_32:
24359 case X86::TLSCall_64:
24360 return EmitLoweredTLSCall(MI, BB);
24361 case X86::CMOV_FR32:
24362 case X86::CMOV_FR64:
24363 case X86::CMOV_FR128:
24364 case X86::CMOV_GR8:
24365 case X86::CMOV_GR16:
24366 case X86::CMOV_GR32:
24367 case X86::CMOV_RFP32:
24368 case X86::CMOV_RFP64:
24369 case X86::CMOV_RFP80:
24370 case X86::CMOV_V2F64:
24371 case X86::CMOV_V2I64:
24372 case X86::CMOV_V4F32:
24373 case X86::CMOV_V4F64:
24374 case X86::CMOV_V4I64:
24375 case X86::CMOV_V16F32:
24376 case X86::CMOV_V8F32:
24377 case X86::CMOV_V8F64:
24378 case X86::CMOV_V8I64:
24379 case X86::CMOV_V8I1:
24380 case X86::CMOV_V16I1:
24381 case X86::CMOV_V32I1:
24382 case X86::CMOV_V64I1:
24383 return EmitLoweredSelect(MI, BB);
24384
24385 case X86::RDFLAGS32:
24386 case X86::RDFLAGS64: {
24387 DebugLoc DL = MI.getDebugLoc();
24388 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24389 unsigned PushF =
24390 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
24391 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
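    // Read the flags by pushing EFLAGS and popping the value into the
    // destination register.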
24392 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
24393 // Permit reads of the FLAGS register without it being defined.
24394 // This intrinsic exists to read external processor state in flags, such as
24395 // the trap flag, interrupt flag, and direction flag, none of which are
24396 // modeled by the backend.
24397 Push->getOperand(2).setIsUndef();
24398 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
24399
24400 MI.eraseFromParent(); // The pseudo is gone now.
24401 return BB;
24402 }
24403
24404 case X86::WRFLAGS32:
24405 case X86::WRFLAGS64: {
24406 DebugLoc DL = MI.getDebugLoc();
24407 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24408 unsigned Push =
24409 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
24410 unsigned PopF =
24411 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
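    // Write the flags by pushing the source register and popping it into
    // EFLAGS.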
24412 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
24413 BuildMI(*BB, MI, DL, TII->get(PopF));
24414
24415 MI.eraseFromParent(); // The pseudo is gone now.
24416 return BB;
24417 }
24418
24419 case X86::RELEASE_FADD32mr:
24420 case X86::RELEASE_FADD64mr:
24421 return EmitLoweredAtomicFP(MI, BB);
24422
24423 case X86::FP32_TO_INT16_IN_MEM:
24424 case X86::FP32_TO_INT32_IN_MEM:
24425 case X86::FP32_TO_INT64_IN_MEM:
24426 case X86::FP64_TO_INT16_IN_MEM:
24427 case X86::FP64_TO_INT32_IN_MEM:
24428 case X86::FP64_TO_INT64_IN_MEM:
24429 case X86::FP80_TO_INT16_IN_MEM:
24430 case X86::FP80_TO_INT32_IN_MEM:
24431 case X86::FP80_TO_INT64_IN_MEM: {
24432 MachineFunction *F = BB->getParent();
24433 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24434 DebugLoc DL = MI.getDebugLoc();
24435
24436 // Change the floating point control register to use "round towards zero"
24437 // mode when truncating to an integer value.
24438 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
24439 addFrameReference(BuildMI(*BB, MI, DL,
24440 TII->get(X86::FNSTCW16m)), CWFrameIdx);
24441
24442     // Save the current value of the control word so it can be restored later...
24443 unsigned OldCW =
24444 F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
24445 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
24446 CWFrameIdx);
24447
24448     // Overwrite the control word in memory with one that rounds toward zero...
24449 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
24450 .addImm(0xC7F);
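    // 0xC7F keeps all x87 exceptions masked and sets the rounding-control
    // field (bits 11:10) to round toward zero.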
24451
24452 // Reload the modified control word now...
24453 addFrameReference(BuildMI(*BB, MI, DL,
24454 TII->get(X86::FLDCW16m)), CWFrameIdx);
24455
24456     // Restore the in-memory image of the control word to its original value
24457 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
24458 .addReg(OldCW);
24459
24460 // Get the X86 opcode to use.
24461 unsigned Opc;
24462 switch (MI.getOpcode()) {
24463 default: llvm_unreachable("illegal opcode!");
24464 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
24465 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
24466 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
24467 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
24468 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
24469 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
24470 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
24471 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
24472 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
24473 }
24474
24475 X86AddressMode AM = getAddressFromInstr(&MI, 0);
24476 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
24477 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
24478
24479 // Reload the original control word now.
24480 addFrameReference(BuildMI(*BB, MI, DL,
24481 TII->get(X86::FLDCW16m)), CWFrameIdx);
24482
24483 MI.eraseFromParent(); // The pseudo instruction is gone now.
24484 return BB;
24485 }
24486 // String/text processing lowering.
24487 case X86::PCMPISTRM128REG:
24488 case X86::VPCMPISTRM128REG:
24489 case X86::PCMPISTRM128MEM:
24490 case X86::VPCMPISTRM128MEM:
24491 case X86::PCMPESTRM128REG:
24492 case X86::VPCMPESTRM128REG:
24493 case X86::PCMPESTRM128MEM:
24494 case X86::VPCMPESTRM128MEM:
24495 assert(Subtarget.hasSSE42() &&
24496 "Target must have SSE4.2 or AVX features enabled");
24497 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
24498
24499 // String/text processing lowering.
24500 case X86::PCMPISTRIREG:
24501 case X86::VPCMPISTRIREG:
24502 case X86::PCMPISTRIMEM:
24503 case X86::VPCMPISTRIMEM:
24504 case X86::PCMPESTRIREG:
24505 case X86::VPCMPESTRIREG:
24506 case X86::PCMPESTRIMEM:
24507 case X86::VPCMPESTRIMEM:
24508 assert(Subtarget.hasSSE42() &&
24509 "Target must have SSE4.2 or AVX features enabled");
24510 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
24511
24512 // Thread synchronization.
24513 case X86::MONITOR:
24514 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
24515 case X86::MONITORX:
24516 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
24517 // PKU feature
24518 case X86::WRPKRU:
24519 return emitWRPKRU(MI, BB, Subtarget);
24520 case X86::RDPKRU:
24521 return emitRDPKRU(MI, BB, Subtarget);
24522 // xbegin
24523 case X86::XBEGIN:
24524 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
24525
24526 case X86::VASTART_SAVE_XMM_REGS:
24527 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
24528
24529 case X86::VAARG_64:
24530 return EmitVAARG64WithCustomInserter(MI, BB);
24531
24532 case X86::EH_SjLj_SetJmp32:
24533 case X86::EH_SjLj_SetJmp64:
24534 return emitEHSjLjSetJmp(MI, BB);
24535
24536 case X86::EH_SjLj_LongJmp32:
24537 case X86::EH_SjLj_LongJmp64:
24538 return emitEHSjLjLongJmp(MI, BB);
24539
24540 case X86::Int_eh_sjlj_setup_dispatch:
24541 return EmitSjLjDispatchBlock(MI, BB);
24542
24543 case TargetOpcode::STATEPOINT:
24544 // As an implementation detail, STATEPOINT shares the STACKMAP format at
24545 // this point in the process. We diverge later.
24546 return emitPatchPoint(MI, BB);
24547
24548 case TargetOpcode::STACKMAP:
24549 case TargetOpcode::PATCHPOINT:
24550 return emitPatchPoint(MI, BB);
24551
24552 case X86::VFMADDPDr213r:
24553 case X86::VFMADDPSr213r:
24554 case X86::VFMADDSDr213r:
24555 case X86::VFMADDSSr213r:
24556 case X86::VFMSUBPDr213r:
24557 case X86::VFMSUBPSr213r:
24558 case X86::VFMSUBSDr213r:
24559 case X86::VFMSUBSSr213r:
24560 case X86::VFNMADDPDr213r:
24561 case X86::VFNMADDPSr213r:
24562 case X86::VFNMADDSDr213r:
24563 case X86::VFNMADDSSr213r:
24564 case X86::VFNMSUBPDr213r:
24565 case X86::VFNMSUBPSr213r:
24566 case X86::VFNMSUBSDr213r:
24567 case X86::VFNMSUBSSr213r:
24568 case X86::VFMADDSUBPDr213r:
24569 case X86::VFMADDSUBPSr213r:
24570 case X86::VFMSUBADDPDr213r:
24571 case X86::VFMSUBADDPSr213r:
24572 case X86::VFMADDPDr213rY:
24573 case X86::VFMADDPSr213rY:
24574 case X86::VFMSUBPDr213rY:
24575 case X86::VFMSUBPSr213rY:
24576 case X86::VFNMADDPDr213rY:
24577 case X86::VFNMADDPSr213rY:
24578 case X86::VFNMSUBPDr213rY:
24579 case X86::VFNMSUBPSr213rY:
24580 case X86::VFMADDSUBPDr213rY:
24581 case X86::VFMADDSUBPSr213rY:
24582 case X86::VFMSUBADDPDr213rY:
24583 case X86::VFMSUBADDPSr213rY:
24584 return emitFMA3Instr(MI, BB);
24585 case X86::LCMPXCHG8B_SAVE_EBX:
24586 case X86::LCMPXCHG16B_SAVE_RBX: {
24587 unsigned BasePtr =
24588 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
24589 if (!BB->isLiveIn(BasePtr))
24590 BB->addLiveIn(BasePtr);
24591 return BB;
24592 }
24593 }
24594 }
24595
24596 //===----------------------------------------------------------------------===//
24597 // X86 Optimization Hooks
24598 //===----------------------------------------------------------------------===//
24599
24600 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
24601 APInt &KnownZero,
24602 APInt &KnownOne,
24603 const SelectionDAG &DAG,
24604 unsigned Depth) const {
24605 unsigned BitWidth = KnownZero.getBitWidth();
24606 unsigned Opc = Op.getOpcode();
24607 assert((Opc >= ISD::BUILTIN_OP_END ||
24608 Opc == ISD::INTRINSIC_WO_CHAIN ||
24609 Opc == ISD::INTRINSIC_W_CHAIN ||
24610 Opc == ISD::INTRINSIC_VOID) &&
24611 "Should use MaskedValueIsZero if you don't know whether Op"
24612 " is a target node!");
24613
24614 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
24615 switch (Opc) {
24616 default: break;
24617 case X86ISD::ADD:
24618 case X86ISD::SUB:
24619 case X86ISD::ADC:
24620 case X86ISD::SBB:
24621 case X86ISD::SMUL:
24622 case X86ISD::UMUL:
24623 case X86ISD::INC:
24624 case X86ISD::DEC:
24625 case X86ISD::OR:
24626 case X86ISD::XOR:
24627 case X86ISD::AND:
24628 // These nodes' second result is a boolean.
24629 if (Op.getResNo() == 0)
24630 break;
24631 // Fallthrough
24632 case X86ISD::SETCC:
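    // SETCC (and the flag results above) produce 0 or 1, so every bit except
    // the lowest is known to be zero.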
24633 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
24634 break;
24635 case X86ISD::MOVMSK: {
24636 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
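    // MOVMSK packs one sign bit per vector element into the low NumLoBits bits
    // of the result; all higher bits are known to be zero.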
24637 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
24638 break;
24639 }
24640 }
24641 }
24642
24643 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
24644 SDValue Op,
24645 const SelectionDAG &,
24646 unsigned Depth) const {
24647 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
24648 if (Op.getOpcode() == X86ISD::SETCC_CARRY)
24649 return Op.getValueType().getScalarSizeInBits();
24650
24651 // Fallback case.
24652 return 1;
24653 }
24654
24655 /// Returns true (and the GlobalValue and the offset) if the node is a
24656 /// GlobalAddress + offset.
24657 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
24658 const GlobalValue* &GA,
24659 int64_t &Offset) const {
24660 if (N->getOpcode() == X86ISD::Wrapper) {
24661 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
24662 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
24663 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
24664 return true;
24665 }
24666 }
24667 return TargetLowering::isGAPlusOffset(N, GA, Offset);
24668 }
24669
24670 /// Performs shuffle combines for 256-bit vectors.
24671 /// FIXME: This could be expanded to support 512 bit vectors as well.
24672 static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG,
24673 TargetLowering::DAGCombinerInfo &DCI,
24674 const X86Subtarget &Subtarget) {
24675 SDLoc dl(N);
24676 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
24677 SDValue V1 = SVOp->getOperand(0);
24678 SDValue V2 = SVOp->getOperand(1);
24679 MVT VT = SVOp->getSimpleValueType(0);
24680 unsigned NumElems = VT.getVectorNumElements();
24681
24682 if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
24683 V2.getOpcode() == ISD::CONCAT_VECTORS) {
24684 //
24685 // 0,0,0,...
24686 // |
24687 // V UNDEF BUILD_VECTOR UNDEF
24688 // \ / \ /
24689 // CONCAT_VECTOR CONCAT_VECTOR
24690 // \ /
24691 // \ /
24692 // RESULT: V + zero extended
24693 //
24694 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
24695 !V2.getOperand(1).isUndef() || !V1.getOperand(1).isUndef())
24696 return SDValue();
24697
24698 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
24699 return SDValue();
24700
24701     // To match the shuffle mask, the first half of the mask should be
24702     // exactly the first vector, and the rest should be a splat of the
24703     // first element of the second one.
24704 for (unsigned i = 0; i != NumElems/2; ++i)
24705 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
24706 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
24707 return SDValue();
24708
24709 // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
24710 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
24711 if (Ld->hasNUsesOfValue(1, 0)) {
24712 SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
24713 SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
24714 SDValue ResNode =
24715 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
24716 Ld->getMemoryVT(),
24717 Ld->getPointerInfo(),
24718 Ld->getAlignment(),
24719 false/*isVolatile*/, true/*ReadMem*/,
24720 false/*WriteMem*/);
24721
24722 // Make sure the newly-created LOAD is in the same position as Ld in
24723 // terms of dependency. We create a TokenFactor for Ld and ResNode,
24724 // and update uses of Ld's output chain to use the TokenFactor.
24725 if (Ld->hasAnyUseOfValue(1)) {
24726 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24727 SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
24728 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
24729 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
24730 SDValue(ResNode.getNode(), 1));
24731 }
24732
24733 return DAG.getBitcast(VT, ResNode);
24734 }
24735 }
24736
24737 // Emit a zeroed vector and insert the desired subvector on its
24738 // first half.
24739 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
24740 SDValue InsV = insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
24741 return DCI.CombineTo(N, InsV);
24742 }
24743
24744 return SDValue();
24745 }
24746
24747 // Attempt to match a combined shuffle mask against supported unary shuffle
24748 // instructions.
24749 // TODO: Investigate sharing more of this with shuffle lowering.
24750 static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
24751 const X86Subtarget &Subtarget,
24752 unsigned &Shuffle, MVT &ShuffleVT) {
24753 bool FloatDomain = SrcVT.isFloatingPoint() ||
24754 (!Subtarget.hasAVX2() && SrcVT.is256BitVector());
24755
24756 // Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction.
24757 if (!FloatDomain && SrcVT.is128BitVector() &&
24758 isTargetShuffleEquivalent(Mask, {0, SM_SentinelZero})) {
24759 Shuffle = X86ISD::VZEXT_MOVL;
24760 ShuffleVT = MVT::v2i64;
24761 return true;
24762 }
24763
24764   // Check if we have SSE3, which lets us use MOVDDUP etc. These
24765   // instructions are no slower than UNPCKLPD but have the option to
24766   // fold the input operand into even an unaligned memory load.
24767 if (SrcVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
24768 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
24769 Shuffle = X86ISD::MOVDDUP;
24770 ShuffleVT = MVT::v2f64;
24771 return true;
24772 }
24773 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
24774 Shuffle = X86ISD::MOVSLDUP;
24775 ShuffleVT = MVT::v4f32;
24776 return true;
24777 }
24778 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
24779 Shuffle = X86ISD::MOVSHDUP;
24780 ShuffleVT = MVT::v4f32;
24781 return true;
24782 }
24783 }
24784
24785 if (SrcVT.is256BitVector() && FloatDomain) {
24786 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
24787 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
24788 Shuffle = X86ISD::MOVDDUP;
24789 ShuffleVT = MVT::v4f64;
24790 return true;
24791 }
24792 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
24793 Shuffle = X86ISD::MOVSLDUP;
24794 ShuffleVT = MVT::v8f32;
24795 return true;
24796 }
24797 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
24798 Shuffle = X86ISD::MOVSHDUP;
24799 ShuffleVT = MVT::v8f32;
24800 return true;
24801 }
24802 }
24803
24804 if (SrcVT.is512BitVector() && FloatDomain) {
24805 assert(Subtarget.hasAVX512() &&
24806 "AVX512 required for 512-bit vector shuffles");
24807 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
24808 Shuffle = X86ISD::MOVDDUP;
24809 ShuffleVT = MVT::v8f64;
24810 return true;
24811 }
24812 if (isTargetShuffleEquivalent(
24813 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
24814 Shuffle = X86ISD::MOVSLDUP;
24815 ShuffleVT = MVT::v16f32;
24816 return true;
24817 }
24818 if (isTargetShuffleEquivalent(
24819 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
24820 Shuffle = X86ISD::MOVSHDUP;
24821 ShuffleVT = MVT::v16f32;
24822 return true;
24823 }
24824 }
24825
24826 // Attempt to match against broadcast-from-vector.
24827 if (Subtarget.hasAVX2()) {
24828 unsigned NumElts = Mask.size();
24829 SmallVector<int, 64> BroadcastMask(NumElts, 0);
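    // An all-zeros mask replicates element 0 into every lane, which maps
    // directly onto VBROADCAST.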
24830 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
24831 unsigned EltSize = SrcVT.getSizeInBits() / NumElts;
24832 ShuffleVT = FloatDomain ? MVT::getFloatingPointVT(EltSize)
24833 : MVT::getIntegerVT(EltSize);
24834 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumElts);
24835 Shuffle = X86ISD::VBROADCAST;
24836 return true;
24837 }
24838 }
24839
24840 return false;
24841 }
24842
24843 // Attempt to match a combined shuffle mask against supported unary immediate
24844 // permute instructions.
24845 // TODO: Investigate sharing more of this with shuffle lowering.
24846 static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
24847 const X86Subtarget &Subtarget,
24848 unsigned &Shuffle, MVT &ShuffleVT,
24849 unsigned &PermuteImm) {
24850 // Ensure we don't contain any zero elements.
24851 for (int M : Mask) {
24852 if (M == SM_SentinelZero)
24853 return false;
24854 assert(SM_SentinelUndef <= M && M < (int)Mask.size() &&
24855 "Expected unary shuffle");
24856 }
24857
24858 unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size();
24859 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
24860
24861 // Handle PSHUFLW/PSHUFHW repeated patterns.
24862 if (MaskScalarSizeInBits == 16) {
24863 SmallVector<int, 4> RepeatedMask;
24864 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
24865 ArrayRef<int> LoMask(Mask.data() + 0, 4);
24866 ArrayRef<int> HiMask(Mask.data() + 4, 4);
24867
24868 // PSHUFLW: permute lower 4 elements only.
24869 if (isUndefOrInRange(LoMask, 0, 4) &&
24870 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
24871 Shuffle = X86ISD::PSHUFLW;
24872 ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
24873 PermuteImm = getV4X86ShuffleImm(LoMask);
24874 return true;
24875 }
24876
24877 // PSHUFHW: permute upper 4 elements only.
24878 if (isUndefOrInRange(HiMask, 4, 8) &&
24879 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
24880 // Offset the HiMask so that we can create the shuffle immediate.
24881 int OffsetHiMask[4];
24882 for (int i = 0; i != 4; ++i)
24883 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
24884
24885 Shuffle = X86ISD::PSHUFHW;
24886 ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
24887 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
24888 return true;
24889 }
24890
24891 return false;
24892 }
24893 return false;
24894 }
24895
24896 // We only support permutation of 32/64 bit elements after this.
24897 if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
24898 return false;
24899
24900   // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
24901   // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
24902 bool FloatDomain = SrcVT.isFloatingPoint();
24903 if (FloatDomain && !Subtarget.hasAVX())
24904 return false;
24905
24906 // Pre-AVX2 we must use float shuffles on 256-bit vectors.
24907 if (SrcVT.is256BitVector() && !Subtarget.hasAVX2())
24908 FloatDomain = true;
24909
24910 // Check for lane crossing permutes.
24911 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
24912 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
24913 if (Subtarget.hasAVX2() && SrcVT.is256BitVector() && Mask.size() == 4) {
24914 Shuffle = X86ISD::VPERMI;
24915 ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
24916 PermuteImm = getV4X86ShuffleImm(Mask);
24917 return true;
24918 }
24919 if (Subtarget.hasAVX512() && SrcVT.is512BitVector() && Mask.size() == 8) {
24920 SmallVector<int, 4> RepeatedMask;
24921 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
24922 Shuffle = X86ISD::VPERMI;
24923 ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
24924 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
24925 return true;
24926 }
24927 }
24928 return false;
24929 }
24930
24931 // VPERMILPD can permute with a non-repeating shuffle.
24932 if (FloatDomain && MaskScalarSizeInBits == 64) {
24933 Shuffle = X86ISD::VPERMILPI;
24934 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
24935 PermuteImm = 0;
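    // VPERMILPD's immediate has one selection bit per element: bit i picks the
    // low (0) or high (1) f64 within element i's 128-bit lane.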
24936 for (int i = 0, e = Mask.size(); i != e; ++i) {
24937 int M = Mask[i];
24938 if (M == SM_SentinelUndef)
24939 continue;
24940 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
24941 PermuteImm |= (M & 1) << i;
24942 }
24943 return true;
24944 }
24945
24946 // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
24947 SmallVector<int, 4> RepeatedMask;
24948 if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
24949 return false;
24950
24951   // If the repeated mask uses 64-bit elements, convert it to the equivalent
  // 32-bit word mask for the dword permutes below.
24952 SmallVector<int, 4> WordMask = RepeatedMask;
24953 if (MaskScalarSizeInBits == 64)
24954 scaleShuffleMask(2, RepeatedMask, WordMask);
24955
24956 Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
24957 ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
24958 ShuffleVT = MVT::getVectorVT(ShuffleVT, SrcVT.getSizeInBits() / 32);
24959 PermuteImm = getV4X86ShuffleImm(WordMask);
24960 return true;
24961 }
24962
24963 // Attempt to match a combined unary shuffle mask against supported binary
24964 // shuffle instructions.
24965 // TODO: Investigate sharing more of this with shuffle lowering.
24966 static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
24967 unsigned &Shuffle, MVT &ShuffleVT) {
24968 bool FloatDomain = SrcVT.isFloatingPoint();
24969
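  // Only unary patterns are matched here; the caller feeds the same input to
  // both operands of the binary shuffle node it creates.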
24970 if (SrcVT.is128BitVector()) {
24971 if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
24972 Shuffle = X86ISD::MOVLHPS;
24973 ShuffleVT = MVT::v4f32;
24974 return true;
24975 }
24976 if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
24977 Shuffle = X86ISD::MOVHLPS;
24978 ShuffleVT = MVT::v4f32;
24979 return true;
24980 }
24981 if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) {
24982 Shuffle = X86ISD::UNPCKL;
24983 ShuffleVT = MVT::v4f32;
24984 return true;
24985 }
24986 if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) {
24987 Shuffle = X86ISD::UNPCKH;
24988 ShuffleVT = MVT::v4f32;
24989 return true;
24990 }
24991 if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) ||
24992 isTargetShuffleEquivalent(
24993 Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
24994 Shuffle = X86ISD::UNPCKL;
24995 ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
24996 return true;
24997 }
24998 if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) ||
24999 isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
25000 13, 14, 14, 15, 15})) {
25001 Shuffle = X86ISD::UNPCKH;
25002 ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
25003 return true;
25004 }
25005 }
25006
25007 return false;
25008 }
25009
25010 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
25011 /// possible.
25012 ///
25013 /// This is the leaf of the recursive combine below. When we have found some
25014 /// chain of single-use x86 shuffle instructions and accumulated the combined
25015 /// shuffle mask represented by them, this will try to pattern match that mask
25016 /// into either a single instruction if there is a special purpose instruction
25017 /// for this operation, or into a PSHUFB instruction which is a fully general
25018 /// instruction but should only be used to replace chains over a certain depth.
25019 static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
25020 ArrayRef<int> BaseMask, int Depth,
25021 bool HasVariableMask, SelectionDAG &DAG,
25022 TargetLowering::DAGCombinerInfo &DCI,
25023 const X86Subtarget &Subtarget) {
25024 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
25025
25026 // Find the operand that enters the chain. Note that multiple uses are OK
25027   // here; we're not going to remove the operand we find.
25028 Input = peekThroughBitcasts(Input);
25029
25030 MVT VT = Input.getSimpleValueType();
25031 MVT RootVT = Root.getSimpleValueType();
25032 SDLoc DL(Root);
25033
25034 SDValue Res;
25035
25036 unsigned NumBaseMaskElts = BaseMask.size();
25037 if (NumBaseMaskElts == 1) {
25038 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
25039 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
25040 /*AddTo*/ true);
25041 return true;
25042 }
25043
25044 unsigned RootSizeInBits = RootVT.getSizeInBits();
25045 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
25046
25047   // Don't combine if we are an AVX512/EVEX target and the mask element size
25048 // is different from the root element size - this would prevent writemasks
25049 // from being reused.
25050 // TODO - this currently prevents all lane shuffles from occurring.
25051 // TODO - check for writemasks usage instead of always preventing combining.
25052 // TODO - attempt to narrow Mask back to writemask size.
25053 if (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits &&
25054 (RootSizeInBits == 512 ||
25055 (Subtarget.hasVLX() && RootSizeInBits >= 128))) {
25056 return false;
25057 }
25058
25059 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
25060
25061 // Handle 128-bit lane shuffles of 256-bit vectors.
25062 if (VT.is256BitVector() && NumBaseMaskElts == 2 &&
25063 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
25064 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
25065 return false; // Nothing to do!
25066 MVT ShuffleVT = (VT.isFloatingPoint() || !Subtarget.hasAVX2() ? MVT::v4f64
25067 : MVT::v4i64);
25068 unsigned PermMask = 0;
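    // VPERM2X128 immediate: bits 1:0 select the source 128-bit lane for the
    // low half and bits 5:4 for the high half; setting bit 3 (resp. bit 7)
    // zeroes that half instead.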
25069 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
25070 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
25071
25072 Res = DAG.getBitcast(ShuffleVT, Input);
25073 DCI.AddToWorklist(Res.getNode());
25074 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
25075 DAG.getUNDEF(ShuffleVT),
25076 DAG.getConstant(PermMask, DL, MVT::i8));
25077 DCI.AddToWorklist(Res.getNode());
25078 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25079 /*AddTo*/ true);
25080 return true;
25081 }
25082
25083 // For masks that have been widened to 128-bit elements or more,
25084 // narrow back down to 64-bit elements.
25085 SmallVector<int, 64> Mask;
25086 if (BaseMaskEltSizeInBits > 64) {
25087 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
25088 int MaskScale = BaseMaskEltSizeInBits / 64;
25089 scaleShuffleMask(MaskScale, BaseMask, Mask);
25090 } else {
25091 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
25092 }
25093
25094 unsigned NumMaskElts = Mask.size();
25095 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
25096
25097 // Determine the effective mask value type.
25098 bool FloatDomain =
25099 (VT.isFloatingPoint() || (VT.is256BitVector() && !Subtarget.hasAVX2())) &&
25100 (32 <= MaskEltSizeInBits);
25101 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
25102 : MVT::getIntegerVT(MaskEltSizeInBits);
25103 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
25104
25105 // Attempt to match the mask against known shuffle patterns.
25106 MVT ShuffleVT;
25107 unsigned Shuffle, PermuteImm;
25108
25109 if (matchUnaryVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT)) {
25110 if (Depth == 1 && Root.getOpcode() == Shuffle)
25111 return false; // Nothing to do!
25112 Res = DAG.getBitcast(ShuffleVT, Input);
25113 DCI.AddToWorklist(Res.getNode());
25114 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
25115 DCI.AddToWorklist(Res.getNode());
25116 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25117 /*AddTo*/ true);
25118 return true;
25119 }
25120
25121 if (matchPermuteVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT,
25122 PermuteImm)) {
25123 if (Depth == 1 && Root.getOpcode() == Shuffle)
25124 return false; // Nothing to do!
25125 Res = DAG.getBitcast(ShuffleVT, Input);
25126 DCI.AddToWorklist(Res.getNode());
25127 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
25128 DAG.getConstant(PermuteImm, DL, MVT::i8));
25129 DCI.AddToWorklist(Res.getNode());
25130 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25131 /*AddTo*/ true);
25132 return true;
25133 }
25134
25135 if (matchBinaryVectorShuffle(VT, Mask, Shuffle, ShuffleVT)) {
25136 if (Depth == 1 && Root.getOpcode() == Shuffle)
25137 return false; // Nothing to do!
25138 Res = DAG.getBitcast(ShuffleVT, Input);
25139 DCI.AddToWorklist(Res.getNode());
25140 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res);
25141 DCI.AddToWorklist(Res.getNode());
25142 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25143 /*AddTo*/ true);
25144 return true;
25145 }
25146
25147 // Attempt to blend with zero.
25148 if (NumMaskElts <= 8 &&
25149 ((Subtarget.hasSSE41() && VT.is128BitVector()) ||
25150 (Subtarget.hasAVX() && VT.is256BitVector()))) {
25151 // Convert VT to a type compatible with X86ISD::BLENDI.
25152 // TODO - add 16i16 support (requires lane duplication).
25153 MVT ShuffleVT = MaskVT;
25154 if (Subtarget.hasAVX2()) {
25155 if (ShuffleVT == MVT::v4i64)
25156 ShuffleVT = MVT::v8i32;
25157 else if (ShuffleVT == MVT::v2i64)
25158 ShuffleVT = MVT::v4i32;
25159 } else {
25160 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
25161 ShuffleVT = MVT::v8i16;
25162 else if (ShuffleVT == MVT::v4i64)
25163 ShuffleVT = MVT::v4f64;
25164 else if (ShuffleVT == MVT::v8i32)
25165 ShuffleVT = MVT::v8f32;
25166 }
25167
25168 if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
25169 /*Low*/ 0) &&
25170 NumMaskElts <= ShuffleVT.getVectorNumElements()) {
25171 unsigned BlendMask = 0;
25172 unsigned ShuffleSize = ShuffleVT.getVectorNumElements();
25173 unsigned MaskRatio = ShuffleSize / NumMaskElts;
25174
25175 if (Depth == 1 && Root.getOpcode() == X86ISD::BLENDI)
25176 return false;
25177
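      // Set a blend bit for every (possibly widened) mask element that is
      // zero/undef so BLENDI takes that element from the zero vector.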
25178 for (unsigned i = 0; i != ShuffleSize; ++i)
25179 if (Mask[i / MaskRatio] < 0)
25180 BlendMask |= 1u << i;
25181
25182 SDValue Zero = getZeroVector(ShuffleVT, Subtarget, DAG, DL);
25183 Res = DAG.getBitcast(ShuffleVT, Input);
25184 DCI.AddToWorklist(Res.getNode());
25185 Res = DAG.getNode(X86ISD::BLENDI, DL, ShuffleVT, Res, Zero,
25186 DAG.getConstant(BlendMask, DL, MVT::i8));
25187 DCI.AddToWorklist(Res.getNode());
25188 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25189 /*AddTo*/ true);
25190 return true;
25191 }
25192 }
25193
25194 // Attempt to combine to INSERTPS.
25195 if (Subtarget.hasSSE41() && NumMaskElts == 4 &&
25196 (VT == MVT::v2f64 || VT == MVT::v4f32)) {
25197 SmallBitVector Zeroable(4, false);
25198 for (unsigned i = 0; i != NumMaskElts; ++i)
25199 if (Mask[i] < 0)
25200 Zeroable[i] = true;
25201
25202 unsigned InsertPSMask;
25203 SDValue V1 = Input, V2 = Input;
25204 if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask,
25205 Zeroable, Mask, DAG)) {
25206 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS)
25207 return false; // Nothing to do!
25208 V1 = DAG.getBitcast(MVT::v4f32, V1);
25209 DCI.AddToWorklist(V1.getNode());
25210 V2 = DAG.getBitcast(MVT::v4f32, V2);
25211 DCI.AddToWorklist(V2.getNode());
25212 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
25213 DAG.getConstant(InsertPSMask, DL, MVT::i8));
25214 DCI.AddToWorklist(Res.getNode());
25215 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25216 /*AddTo*/ true);
25217 return true;
25218 }
25219 }
25220
25221 // Don't try to re-form single instruction chains under any circumstances now
25222 // that we've done encoding canonicalization for them.
25223 if (Depth < 2)
25224 return false;
25225
25226 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask))
25227 return false;
25228
25229 bool MaskContainsZeros =
25230 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
25231
25232   // If we have a single-input shuffle with different shuffle patterns in
25233   // the 128-bit lanes, use a variable mask with VPERMILPS.
25234   // TODO: Combine other mask types at higher depths.
25235 if (HasVariableMask && !MaskContainsZeros &&
25236 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
25237 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
25238 SmallVector<SDValue, 16> VPermIdx;
25239 for (int M : Mask) {
25240 SDValue Idx =
25241 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
25242 VPermIdx.push_back(Idx);
25243 }
25244 MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
25245 SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
25246 DCI.AddToWorklist(VPermMask.getNode());
25247 Res = DAG.getBitcast(MaskVT, Input);
25248 DCI.AddToWorklist(Res.getNode());
25249 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
25250 DCI.AddToWorklist(Res.getNode());
25251 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25252 /*AddTo*/ true);
25253 return true;
25254 }
25255
25256 // If we have 3 or more shuffle instructions or a chain involving a variable
25257 // mask, we can replace them with a single PSHUFB instruction profitably.
25258   // Intel's manuals suggest only using PSHUFB if doing so replaces 5
25259 // instructions, but in practice PSHUFB tends to be *very* fast so we're
25260 // more aggressive.
25261 if ((Depth >= 3 || HasVariableMask) &&
25262 ((VT.is128BitVector() && Subtarget.hasSSSE3()) ||
25263 (VT.is256BitVector() && Subtarget.hasAVX2()) ||
25264 (VT.is512BitVector() && Subtarget.hasBWI()))) {
25265 SmallVector<SDValue, 16> PSHUFBMask;
25266 int NumBytes = VT.getSizeInBits() / 8;
25267 int Ratio = NumBytes / NumMaskElts;
25268 for (int i = 0; i < NumBytes; ++i) {
25269 int M = Mask[i / Ratio];
25270 if (M == SM_SentinelUndef) {
25271 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
25272 continue;
25273 }
25274 if (M == SM_SentinelZero) {
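        // A PSHUFB mask byte with its top bit set writes a zero byte; 0xFF
        // does exactly that.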
25275 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
25276 continue;
25277 }
25278 M = Ratio * M + i % Ratio;
25279 assert ((M / 16) == (i / 16) && "Lane crossing detected");
25280 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
25281 }
25282 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
25283 Res = DAG.getBitcast(ByteVT, Input);
25284 DCI.AddToWorklist(Res.getNode());
25285 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
25286 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
25287 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
25288 DCI.AddToWorklist(Res.getNode());
25289 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25290 /*AddTo*/ true);
25291 return true;
25292 }
25293
25294 // Failed to find any combines.
25295 return false;
25296 }
25297
25298 /// \brief Fully generic combining of x86 shuffle instructions.
25299 ///
25300 /// This should be the last combine run over the x86 shuffle instructions. Once
25301 /// they have been fully optimized, this will recursively consider all chains
25302 /// of single-use shuffle instructions, build a generic model of the cumulative
25303 /// shuffle operation, and check for simpler instructions which implement this
25304 /// operation. We use this primarily for two purposes:
25305 ///
25306 /// 1) Collapse generic shuffles to specialized single instructions when
25307 /// equivalent. In most cases, this is just an encoding size win, but
25308 /// sometimes we will collapse multiple generic shuffles into a single
25309 /// special-purpose shuffle.
25310 /// 2) Look for sequences of shuffle instructions with 3 or more total
25311 /// instructions, and replace them with the slightly more expensive SSSE3
25312 /// PSHUFB instruction if available. We do this as the last combining step
25313 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
25314 /// a suitable short sequence of other instructions. The PSHUFB will either
25315 /// use a register or have to read from memory and so is slightly (but only
25316 /// slightly) more expensive than the other shuffle instructions.
25317 ///
25318 /// Because this is inherently a quadratic operation (for each shuffle in
25319 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
25320 /// This should never be an issue in practice as the shuffle lowering doesn't
25321 /// produce sequences of more than 8 instructions.
25322 ///
25323 /// FIXME: We will currently miss some cases where the redundant shuffling
25324 /// would simplify under the threshold for PSHUFB formation because of
25325 /// combine-ordering. To fix this, we should do the redundant instruction
25326 /// combining in this recursive walk.
25327 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
25328 ArrayRef<int> RootMask,
25329 int Depth, bool HasVariableMask,
25330 SelectionDAG &DAG,
25331 TargetLowering::DAGCombinerInfo &DCI,
25332 const X86Subtarget &Subtarget) {
25333 // Bound the depth of our recursive combine because this is ultimately
25334 // quadratic in nature.
25335 if (Depth > 8)
25336 return false;
25337
25338 // Directly rip through bitcasts to find the underlying operand.
25339 while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
25340 Op = Op.getOperand(0);
25341
25342 MVT VT = Op.getSimpleValueType();
25343 if (!VT.isVector())
25344 return false; // Bail if we hit a non-vector.
25345
25346 assert(Root.getSimpleValueType().isVector() &&
25347 "Shuffles operate on vector types!");
25348 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
25349 "Can only combine shuffles of the same vector register size.");
25350
25351 // Extract target shuffle mask and resolve sentinels and inputs.
25352 SDValue Input0, Input1;
25353 SmallVector<int, 16> OpMask;
25354 if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
25355 return false;
25356
25357 assert(VT.getVectorNumElements() == OpMask.size() &&
25358 "Different mask size from vector size!");
25359 assert(((RootMask.size() > OpMask.size() &&
25360 RootMask.size() % OpMask.size() == 0) ||
25361 (OpMask.size() > RootMask.size() &&
25362 OpMask.size() % RootMask.size() == 0) ||
25363 OpMask.size() == RootMask.size()) &&
25364 "The smaller number of elements must divide the larger.");
25365 int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
25366 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
25367 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
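  // At most one of the two masks may be finer-grained; the ratios rescale
  // indices between the two granularities when the masks are merged below.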
25368 assert(((RootRatio == 1 && OpRatio == 1) ||
25369 (RootRatio == 1) != (OpRatio == 1)) &&
25370 "Must not have a ratio for both incoming and op masks!");
25371
25372 SmallVector<int, 16> Mask;
25373 Mask.reserve(MaskWidth);
25374
25375 // Merge this shuffle operation's mask into our accumulated mask. Note that
25376 // this shuffle's mask will be the first applied to the input, followed by the
25377 // root mask to get us all the way to the root value arrangement. The reason
25378 // for this order is that we are recursing up the operation chain.
25379 for (int i = 0; i < MaskWidth; ++i) {
25380 int RootIdx = i / RootRatio;
25381 if (RootMask[RootIdx] < 0) {
25382 // This is a zero or undef lane, we're done.
25383 Mask.push_back(RootMask[RootIdx]);
25384 continue;
25385 }
25386
25387 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
25388 int OpIdx = RootMaskedIdx / OpRatio;
25389 if (OpMask[OpIdx] < 0) {
25390       // The incoming lanes are zero or undef; it doesn't matter which ones we
25391       // are using.
25392 Mask.push_back(OpMask[OpIdx]);
25393 continue;
25394 }
25395
25396 // Ok, we have non-zero lanes, map them through.
25397 Mask.push_back(OpMask[OpIdx] * OpRatio +
25398 RootMaskedIdx % OpRatio);
25399 }
25400
25401 // Handle the all undef/zero cases early.
25402 if (llvm::all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
25403 DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
25404 return true;
25405 }
25406 if (llvm::all_of(Mask, [](int Idx) { return Idx < 0; })) {
25407 // TODO - should we handle the mixed zero/undef case as well? Just returning
25408     // a zero mask will lose information on undef elements, possibly reducing
25409 // future combine possibilities.
25410 DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
25411 Subtarget, DAG, SDLoc(Root)));
25412 return true;
25413 }
25414
25415 int MaskSize = Mask.size();
25416 bool UseInput0 = std::any_of(Mask.begin(), Mask.end(),
25417 [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; });
25418 bool UseInput1 = std::any_of(Mask.begin(), Mask.end(),
25419 [MaskSize](int Idx) { return MaskSize <= Idx; });
25420
25421 // At the moment we can only combine unary shuffle mask cases.
25422 if (UseInput0 && UseInput1)
25423 return false;
25424 else if (UseInput1) {
25425 std::swap(Input0, Input1);
25426 ShuffleVectorSDNode::commuteMask(Mask);
25427 }
25428
25429 assert(Input0 && "Shuffle with no inputs detected");
25430
25431 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
25432
25433 // See if we can recurse into Input0 (if it's a target shuffle).
25434 if (Op->isOnlyUserOf(Input0.getNode()) &&
25435 combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1,
25436 HasVariableMask, DAG, DCI, Subtarget))
25437 return true;
25438
25439 // Minor canonicalization of the accumulated shuffle mask to make it easier
25440 // to match below. All this does is detect masks with sequential pairs of
25441 // elements, and shrink them to the half-width mask. It does this in a loop
25442 // so it will reduce the size of the mask to the minimal width mask which
25443 // performs an equivalent shuffle.
25444 SmallVector<int, 16> WidenedMask;
25445 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
25446 Mask = std::move(WidenedMask);
25447 }
25448
25449 return combineX86ShuffleChain(Input0, Root, Mask, Depth, HasVariableMask, DAG,
25450 DCI, Subtarget);
25451 }
25452
25453 /// \brief Get the PSHUF-style mask from PSHUF node.
25454 ///
25455 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
25456 /// PSHUF-style masks that can be reused with such instructions.
25457 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
25458 MVT VT = N.getSimpleValueType();
25459 SmallVector<int, 4> Mask;
25460 SmallVector<SDValue, 2> Ops;
25461 bool IsUnary;
25462 bool HaveMask =
25463 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
25464 (void)HaveMask;
25465 assert(HaveMask);
25466
25467   // If we have more than 128 bits, only the low 128 bits of the shuffle mask
25468   // matter. Check that the upper masks are repeats and remove them.
25469 if (VT.getSizeInBits() > 128) {
25470 int LaneElts = 128 / VT.getScalarSizeInBits();
25471 #ifndef NDEBUG
25472 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
25473 for (int j = 0; j < LaneElts; ++j)
25474 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
25475 "Mask doesn't repeat in high 128-bit lanes!");
25476 #endif
25477 Mask.resize(LaneElts);
25478 }
25479
25480 switch (N.getOpcode()) {
25481 case X86ISD::PSHUFD:
25482 return Mask;
25483 case X86ISD::PSHUFLW:
25484 Mask.resize(4);
25485 return Mask;
25486 case X86ISD::PSHUFHW:
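    // PSHUFHW only permutes the high four words; drop the identity low half
    // and rebase the remaining indices to 0-3.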
25487 Mask.erase(Mask.begin(), Mask.begin() + 4);
25488 for (int &M : Mask)
25489 M -= 4;
25490 return Mask;
25491 default:
25492 llvm_unreachable("No valid shuffle instruction found!");
25493 }
25494 }
25495
25496 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
25497 ///
25498 /// We walk up the chain and look for a combinable shuffle, skipping over
25499 /// shuffles that we could hoist this shuffle's transformation past without
25500 /// altering anything.
25501 static SDValue
25502 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
25503 SelectionDAG &DAG,
25504 TargetLowering::DAGCombinerInfo &DCI) {
25505 assert(N.getOpcode() == X86ISD::PSHUFD &&
25506 "Called with something other than an x86 128-bit half shuffle!");
25507 SDLoc DL(N);
25508
25509 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
25510 // of the shuffles in the chain so that we can form a fresh chain to replace
25511 // this one.
25512 SmallVector<SDValue, 8> Chain;
25513 SDValue V = N.getOperand(0);
25514 for (; V.hasOneUse(); V = V.getOperand(0)) {
25515 switch (V.getOpcode()) {
25516 default:
25517 return SDValue(); // Nothing combined!
25518
25519 case ISD::BITCAST:
25520 // Skip bitcasts as we always know the type for the target specific
25521 // instructions.
25522 continue;
25523
25524 case X86ISD::PSHUFD:
25525 // Found another dword shuffle.
25526 break;
25527
25528 case X86ISD::PSHUFLW:
25529 // Check that the low words (being shuffled) are the identity in the
25530 // dword shuffle, and the high words are self-contained.
25531 if (Mask[0] != 0 || Mask[1] != 1 ||
25532 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
25533 return SDValue();
25534
25535 Chain.push_back(V);
25536 continue;
25537
25538 case X86ISD::PSHUFHW:
25539 // Check that the high words (being shuffled) are the identity in the
25540 // dword shuffle, and the low words are self-contained.
25541 if (Mask[2] != 2 || Mask[3] != 3 ||
25542 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
25543 return SDValue();
25544
25545 Chain.push_back(V);
25546 continue;
25547
25548 case X86ISD::UNPCKL:
25549 case X86ISD::UNPCKH:
25550 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
25551 // shuffle into a preceding word shuffle.
25552 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
25553 V.getSimpleValueType().getVectorElementType() != MVT::i16)
25554 return SDValue();
25555
25556 // Search for a half-shuffle which we can combine with.
25557 unsigned CombineOp =
25558 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
25559 if (V.getOperand(0) != V.getOperand(1) ||
25560 !V->isOnlyUserOf(V.getOperand(0).getNode()))
25561 return SDValue();
25562 Chain.push_back(V);
25563 V = V.getOperand(0);
25564 do {
25565 switch (V.getOpcode()) {
25566 default:
25567 return SDValue(); // Nothing to combine.
25568
25569 case X86ISD::PSHUFLW:
25570 case X86ISD::PSHUFHW:
25571 if (V.getOpcode() == CombineOp)
25572 break;
25573
25574 Chain.push_back(V);
25575
25576 // Fallthrough!
25577 case ISD::BITCAST:
25578 V = V.getOperand(0);
25579 continue;
25580 }
25581 break;
25582 } while (V.hasOneUse());
25583 break;
25584 }
25585 // Break out of the loop if we break out of the switch.
25586 break;
25587 }
25588
25589 if (!V.hasOneUse())
25590 // We fell out of the loop without finding a viable combining instruction.
25591 return SDValue();
25592
25593 // Merge this node's mask and our incoming mask.
25594 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
25595 for (int &M : Mask)
25596 M = VMask[M];
25597 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
25598 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
25599
25600 // Rebuild the chain around this new shuffle.
25601 while (!Chain.empty()) {
25602 SDValue W = Chain.pop_back_val();
25603
25604 if (V.getValueType() != W.getOperand(0).getValueType())
25605 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
25606
25607 switch (W.getOpcode()) {
25608 default:
25609 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
25610
25611 case X86ISD::UNPCKL:
25612 case X86ISD::UNPCKH:
25613 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
25614 break;
25615
25616 case X86ISD::PSHUFD:
25617 case X86ISD::PSHUFLW:
25618 case X86ISD::PSHUFHW:
25619 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
25620 break;
25621 }
25622 }
25623 if (V.getValueType() != N.getValueType())
25624 V = DAG.getBitcast(N.getValueType(), V);
25625
25626 // Return the new chain to replace N.
25627 return V;
25628 }
25629
25630 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
25631 /// pshufhw.
25632 ///
25633 /// We walk up the chain, skipping shuffles of the other half and looking
25634 /// through shuffles which switch halves trying to find a shuffle of the same
25635 /// pair of dwords.
25636 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
25637 SelectionDAG &DAG,
25638 TargetLowering::DAGCombinerInfo &DCI) {
25639 assert(
25640 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
25641 "Called with something other than an x86 128-bit half shuffle!");
25642 SDLoc DL(N);
25643 unsigned CombineOpcode = N.getOpcode();
25644
25645 // Walk up a single-use chain looking for a combinable shuffle.
25646 SDValue V = N.getOperand(0);
25647 for (; V.hasOneUse(); V = V.getOperand(0)) {
25648 switch (V.getOpcode()) {
25649 default:
25650 return false; // Nothing combined!
25651
25652 case ISD::BITCAST:
25653 // Skip bitcasts as we always know the type for the target specific
25654 // instructions.
25655 continue;
25656
25657 case X86ISD::PSHUFLW:
25658 case X86ISD::PSHUFHW:
25659 if (V.getOpcode() == CombineOpcode)
25660 break;
25661
25662 // Other-half shuffles are no-ops.
25663 continue;
25664 }
25665 // Break out of the loop if we break out of the switch.
25666 break;
25667 }
25668
25669 if (!V.hasOneUse())
25670 // We fell out of the loop without finding a viable combining instruction.
25671 return false;
25672
25673 // Combine away the bottom node as its shuffle will be accumulated into
25674 // a preceding shuffle.
25675 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
25676
25677 // Record the old value.
25678 SDValue Old = V;
25679
25680 // Merge this node's mask and our incoming mask (adjusted to account for all
25681 // the pshufd instructions encountered).
25682 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
25683 for (int &M : Mask)
25684 M = VMask[M];
25685 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
25686 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
25687
25688 // Check that the shuffles didn't cancel each other out. If not, we need to
25689 // combine to the new one.
25690 if (Old != V)
25691 // Replace the combinable shuffle with the combined one, updating all users
25692 // so that we re-evaluate the chain here.
25693 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
25694
25695 return true;
25696 }
25697
25698 /// \brief Try to combine x86 target specific shuffles.
25699 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
25700 TargetLowering::DAGCombinerInfo &DCI,
25701 const X86Subtarget &Subtarget) {
25702 SDLoc DL(N);
25703 MVT VT = N.getSimpleValueType();
25704 SmallVector<int, 4> Mask;
25705
25706 switch (N.getOpcode()) {
25707 case X86ISD::PSHUFD:
25708 case X86ISD::PSHUFLW:
25709 case X86ISD::PSHUFHW:
25710 Mask = getPSHUFShuffleMask(N);
25711 assert(Mask.size() == 4);
25712 break;
25713 case X86ISD::UNPCKL: {
25714 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
25715 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
25716 // moves upper half elements into the lower half part. For example:
25717 //
25718 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
25719 // undef:v16i8
25720 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
25721 //
25722 // will be combined to:
25723 //
25724 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
25725
25726 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
25727 // trigger because lowering prefers more advanced shuffle instructions.
25728 if (!VT.is128BitVector())
25729 return SDValue();
25730
25731 auto Op0 = N.getOperand(0);
25732 auto Op1 = N.getOperand(1);
25733 if (Op0.isUndef() && Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
25734 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
25735
25736 unsigned NumElts = VT.getVectorNumElements();
25737 SmallVector<int, 8> ExpectedMask(NumElts, -1);
25738 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
25739 NumElts / 2);
25740
25741 auto ShufOp = Op1.getOperand(0);
25742 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
25743 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
25744 }
25745 return SDValue();
25746 }
25747 case X86ISD::BLENDI: {
25748 SDValue V0 = N->getOperand(0);
25749 SDValue V1 = N->getOperand(1);
25750 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
25751 "Unexpected input vector types");
25752
25753 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
25754 // operands and changing the mask to 1. This saves us a bunch of
25755 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
25756 // X86InstrInfo knows how to commute this back after instruction selection
25757 // if it would help register allocation.
25758
25759 // TODO: If optimizing for size or a processor that doesn't suffer from
25760 // partial register update stalls, this should be transformed into a MOVSD
25761 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
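// Illustrative example (immediate encoding assumed: a set bit selects the
// element from the second operand):
//   blendi(V0, V1, 0b10) == blendi(V1, V0, 0b01)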
25762
25763 if (VT == MVT::v2f64)
25764 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
25765 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
25766 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
25767 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
25768 }
25769
25770 // Attempt to merge blend(insertps(x,y),zero).
25771 if (V0.getOpcode() == X86ISD::INSERTPS ||
25772 V1.getOpcode() == X86ISD::INSERTPS) {
25773 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
25774
25775 // Determine which elements are known to be zero.
25776 SmallVector<int, 8> TargetMask;
25777 SmallVector<SDValue, 2> BlendOps;
25778 if (!setTargetShuffleZeroElements(N, TargetMask, BlendOps))
25779 return SDValue();
25780
25781 // Helper function to take inner insertps node and attempt to
25782 // merge the blend with zero into its zero mask.
25783 auto MergeInsertPSAndBlend = [&](SDValue V, int Offset) {
25784 if (V.getOpcode() != X86ISD::INSERTPS)
25785 return SDValue();
25786 SDValue Op0 = V.getOperand(0);
25787 SDValue Op1 = V.getOperand(1);
25788 SDValue Op2 = V.getOperand(2);
25789 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
25790
25791 // Check each element of the blend node's target mask - it must either
25792 // be zeroable (in which case we update the zero mask) or select the
25793 // element from the inner insertps node.
25794 for (int i = 0; i != 4; ++i)
25795 if (TargetMask[i] < 0)
25796 InsertPSMask |= (1u << i);
25797 else if (TargetMask[i] != (i + Offset))
25798 return SDValue();
25799 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
25800 DAG.getConstant(InsertPSMask, DL, MVT::i8));
25801 };
25802
25803 if (SDValue V = MergeInsertPSAndBlend(V0, 0))
25804 return V;
25805 if (SDValue V = MergeInsertPSAndBlend(V1, 4))
25806 return V;
25807 }
25808 return SDValue();
25809 }
25810 case X86ISD::INSERTPS: {
25811 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
25812 SDValue Op0 = N.getOperand(0);
25813 SDValue Op1 = N.getOperand(1);
25814 SDValue Op2 = N.getOperand(2);
25815 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
25816 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
25817 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
25818 unsigned ZeroMask = InsertPSMask & 0xF;
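// For reference (matches the decode above): the insertps immediate is laid
// out as [7:6] = source element, [5:4] = destination element, and
// [3:0] = zero mask.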
25819
25820 // If we zero out all elements from Op0 then we don't need to reference it.
25821 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
25822 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
25823 DAG.getConstant(InsertPSMask, DL, MVT::i8));
25824
25825 // If we zero out the element from Op1 then we don't need to reference it.
25826 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
25827 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
25828 DAG.getConstant(InsertPSMask, DL, MVT::i8));
25829
25830 // Attempt to merge insertps Op1 with an inner target shuffle node.
25831 SmallVector<int, 8> TargetMask1;
25832 SmallVector<SDValue, 2> Ops1;
25833 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
25834 int M = TargetMask1[SrcIdx];
25835 if (isUndefOrZero(M)) {
25836 // Zero/UNDEF insertion - zero out element and remove dependency.
25837 InsertPSMask |= (1u << DstIdx);
25838 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
25839 DAG.getConstant(InsertPSMask, DL, MVT::i8));
25840 }
25841 // Update insertps mask srcidx and reference the source input directly.
25842 assert(0 <= M && M < 8 && "Shuffle index out of range");
25843 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
25844 Op1 = Ops1[M < 4 ? 0 : 1];
25845 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
25846 DAG.getConstant(InsertPSMask, DL, MVT::i8));
25847 }
25848
25849 // Attempt to merge insertps Op0 with an inner target shuffle node.
25850 SmallVector<int, 8> TargetMask0;
25851 SmallVector<SDValue, 2> Ops0;
25852 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
25853 return SDValue();
25854
25855 bool Updated = false;
25856 bool UseInput00 = false;
25857 bool UseInput01 = false;
25858 for (int i = 0; i != 4; ++i) {
25859 int M = TargetMask0[i];
25860 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
25861 // No change if element is already zero or the inserted element.
25862 continue;
25863 } else if (isUndefOrZero(M)) {
25864 // If the target mask is undef/zero then we must zero the element.
25865 InsertPSMask |= (1u << i);
25866 Updated = true;
25867 continue;
25868 }
25869
25870 // The input vector element must be inline.
25871 if (M != i && M != (i + 4))
25872 return SDValue();
25873
25874 // Determine which inputs of the target shuffle we're using.
25875 UseInput00 |= (0 <= M && M < 4);
25876 UseInput01 |= (4 <= M);
25877 }
25878
25879 // If we're not using both inputs of the target shuffle then use the
25880 // referenced input directly.
25881 if (UseInput00 && !UseInput01) {
25882 Updated = true;
25883 Op0 = Ops0[0];
25884 } else if (!UseInput00 && UseInput01) {
25885 Updated = true;
25886 Op0 = Ops0[1];
25887 }
25888
25889 if (Updated)
25890 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
25891 DAG.getConstant(InsertPSMask, DL, MVT::i8));
25892
25893 return SDValue();
25894 }
25895 default:
25896 return SDValue();
25897 }
25898
25899 // Nuke no-op shuffles that show up after combining.
25900 if (isNoopShuffleMask(Mask))
25901 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
25902
25903 // Look for simplifications involving one or two shuffle instructions.
25904 SDValue V = N.getOperand(0);
25905 switch (N.getOpcode()) {
25906 default:
25907 break;
25908 case X86ISD::PSHUFLW:
25909 case X86ISD::PSHUFHW:
25910 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
25911
25912 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
25913 return SDValue(); // We combined away this shuffle, so we're done.
25914
25915 // See if this reduces to a PSHUFD which is no more expensive and can
25916 // combine with more operations. Note that it has to at least flip the
25917 // dwords as otherwise it would have been removed as a no-op.
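// Illustrative example (not from the original comments): for PSHUFLW, a
// word mask of <2,3,0,1> swaps word pair 0-1 with word pair 2-3, which is
// the same as swapping dwords 0 and 1, i.e. pshufd <1,0,2,3>.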
25918 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
25919 int DMask[] = {0, 1, 2, 3};
25920 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
25921 DMask[DOffset + 0] = DOffset + 1;
25922 DMask[DOffset + 1] = DOffset + 0;
25923 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
25924 V = DAG.getBitcast(DVT, V);
25925 DCI.AddToWorklist(V.getNode());
25926 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
25927 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
25928 DCI.AddToWorklist(V.getNode());
25929 return DAG.getBitcast(VT, V);
25930 }
25931
25932 // Look for shuffle patterns which can be implemented as a single unpack.
25933 // FIXME: This doesn't handle the location of the PSHUFD generically, and
25934 // only works when we have a PSHUFD followed by two half-shuffles.
25935 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
25936 (V.getOpcode() == X86ISD::PSHUFLW ||
25937 V.getOpcode() == X86ISD::PSHUFHW) &&
25938 V.getOpcode() != N.getOpcode() &&
25939 V.hasOneUse()) {
25940 SDValue D = V.getOperand(0);
25941 while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
25942 D = D.getOperand(0);
25943 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
25944 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
25945 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
25946 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
25947 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
25948 int WordMask[8];
25949 for (int i = 0; i < 4; ++i) {
25950 WordMask[i + NOffset] = Mask[i] + NOffset;
25951 WordMask[i + VOffset] = VMask[i] + VOffset;
25952 }
25953 // Map the word mask through the DWord mask.
25954 int MappedMask[8];
25955 for (int i = 0; i < 8; ++i)
25956 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
25957 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
25958 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
25959 // We can replace all three shuffles with an unpack.
25960 V = DAG.getBitcast(VT, D.getOperand(0));
25961 DCI.AddToWorklist(V.getNode());
25962 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
25963 : X86ISD::UNPCKH,
25964 DL, VT, V, V);
25965 }
25966 }
25967 }
25968
25969 break;
25970
25971 case X86ISD::PSHUFD:
25972 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
25973 return NewN;
25974
25975 break;
25976 }
25977
25978 return SDValue();
25979 }
25980
25981 /// \brief Try to combine a shuffle into a target-specific add-sub node.
25982 ///
25983 /// We combine this directly on the abstract vector shuffle nodes so it is
25984 /// easier to generically match. We also insert dummy vector shuffle nodes for
25985 /// the operands which explicitly discard the lanes which are unused by this
25986 /// operation, so that the rest of the combiner can see that those lanes are
25987 /// unused.
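///
/// Illustrative example (not from the original comments), for v4f32:
///   (vector_shuffle<0,5,2,7> (fsub A, B), (fadd A, B))
/// becomes (X86ISD::ADDSUB A, B), taking the even lanes from the FSUB and
/// the odd lanes from the FADD.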
25988 static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
25989 SelectionDAG &DAG) {
25990 SDLoc DL(N);
25991 EVT VT = N->getValueType(0);
25992 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
25993 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
25994 return SDValue();
25995
25996 // We only handle target-independent shuffles.
25997 // FIXME: It would be easy and harmless to use the target shuffle mask
25998 // extraction tool to support more.
25999 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
26000 return SDValue();
26001
26002 auto *SVN = cast<ShuffleVectorSDNode>(N);
26003 SmallVector<int, 8> Mask;
26004 for (int M : SVN->getMask())
26005 Mask.push_back(M);
26006
26007 SDValue V1 = N->getOperand(0);
26008 SDValue V2 = N->getOperand(1);
26009
26010 // We require the first shuffle operand to be the FSUB node, and the second to
26011 // be the FADD node.
26012 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
26013 ShuffleVectorSDNode::commuteMask(Mask);
26014 std::swap(V1, V2);
26015 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
26016 return SDValue();
26017
26018 // If there are other uses of these operations we can't fold them.
26019 if (!V1->hasOneUse() || !V2->hasOneUse())
26020 return SDValue();
26021
26022 // Ensure that both operations have the same operands. Note that we can
26023 // commute the FADD operands.
26024 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
26025 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
26026 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
26027 return SDValue();
26028
26029 // We're looking for blends between FADD and FSUB nodes. We insist on these
26030 // nodes being lined up in a specific expected pattern.
26031 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
26032 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
26033 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
26034 return SDValue();
26035
26036 return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
26037 }
26038
26039 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
26040 TargetLowering::DAGCombinerInfo &DCI,
26041 const X86Subtarget &Subtarget) {
26042 SDLoc dl(N);
26043 EVT VT = N->getValueType(0);
26044
26045 // Don't create instructions with illegal types after legalize types has run.
26046 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26047 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
26048 return SDValue();
26049
26050 // If we have legalized the vector types, look for blends of FADD and FSUB
26051 // nodes that we can fuse into an ADDSUB node.
26052 if (TLI.isTypeLegal(VT))
26053 if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
26054 return AddSub;
26055
26056 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
26057 if (TLI.isTypeLegal(VT) && Subtarget.hasFp256() && VT.is256BitVector() &&
26058 N->getOpcode() == ISD::VECTOR_SHUFFLE)
26059 return combineShuffle256(N, DAG, DCI, Subtarget);
26060
26061 // During Type Legalization, when promoting illegal vector types,
26062 // the backend might introduce new shuffle dag nodes and bitcasts.
26063 //
26064 // This code performs the following transformation:
26065 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
26066 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
26067 //
26068 // We do this only if both the bitcast and the BINOP dag nodes have
26069 // one use. Also, perform this transformation only if the new binary
26070 // operation is legal. This is to avoid introducing dag nodes that
26071 // potentially need to be further expanded (or custom lowered) into a
26072 // less optimal sequence of dag nodes.
26073 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
26074 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
26075 N->getOperand(0).getOpcode() == ISD::BITCAST &&
26076 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
26077 SDValue N0 = N->getOperand(0);
26078 SDValue N1 = N->getOperand(1);
26079
26080 SDValue BC0 = N0.getOperand(0);
26081 EVT SVT = BC0.getValueType();
26082 unsigned Opcode = BC0.getOpcode();
26083 unsigned NumElts = VT.getVectorNumElements();
26084
26085 if (BC0.hasOneUse() && SVT.isVector() &&
26086 SVT.getVectorNumElements() * 2 == NumElts &&
26087 TLI.isOperationLegal(Opcode, VT)) {
26088 bool CanFold = false;
26089 switch (Opcode) {
26090 default : break;
26091 case ISD::ADD :
26092 case ISD::FADD :
26093 case ISD::SUB :
26094 case ISD::FSUB :
26095 case ISD::MUL :
26096 case ISD::FMUL :
26097 CanFold = true;
26098 }
26099
26100 unsigned SVTNumElts = SVT.getVectorNumElements();
26101 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
26102 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
26103 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
26104 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
26105 CanFold = SVOp->getMaskElt(i) < 0;
26106
26107 if (CanFold) {
26108 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
26109 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
26110 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
26111 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
26112 }
26113 }
26114 }
26115
26116 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
26117 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
26118 // consecutive, non-overlapping, and in the right order.
26119 SmallVector<SDValue, 16> Elts;
26120 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
26121 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
26122
26123 if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
26124 return LD;
26125
26126 if (isTargetShuffle(N->getOpcode())) {
26127 if (SDValue Shuffle =
26128 combineTargetShuffle(SDValue(N, 0), DAG, DCI, Subtarget))
26129 return Shuffle;
26130
26131 // Try recursively combining arbitrary sequences of x86 shuffle
26132 // instructions into higher-order shuffles. We do this after combining
26133 // specific PSHUF instruction sequences into their minimal form so that we
26134 // can evaluate how many specialized shuffle instructions are involved in
26135 // a particular chain.
26136 SmallVector<int, 1> NonceMask; // Just a placeholder.
26137 NonceMask.push_back(0);
26138 if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
26139 /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
26140 DCI, Subtarget))
26141 return SDValue(); // This routine will use CombineTo to replace N.
26142 }
26143
26144 return SDValue();
26145 }
26146
26147 /// Check if a vector extract from a target-specific shuffle of a load can be
26148 /// folded into a single element load.
26149 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
26150 /// shuffles have been custom lowered so we need to handle those here.
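///
/// Illustrative sketch (not from the original comments): a node such as
///   (extract_vector_elt (X86ISD::PSHUFD (load p), imm), i)
/// is rebuilt as a generic vector_shuffle of the load so that the generic
/// combiner can turn the extract into a single scalar load.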
26151 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
26152 TargetLowering::DAGCombinerInfo &DCI) {
26153 if (DCI.isBeforeLegalizeOps())
26154 return SDValue();
26155
26156 SDValue InVec = N->getOperand(0);
26157 SDValue EltNo = N->getOperand(1);
26158 EVT EltVT = N->getValueType(0);
26159
26160 if (!isa<ConstantSDNode>(EltNo))
26161 return SDValue();
26162
26163 EVT OriginalVT = InVec.getValueType();
26164
26165 if (InVec.getOpcode() == ISD::BITCAST) {
26166 // Don't duplicate a load with other uses.
26167 if (!InVec.hasOneUse())
26168 return SDValue();
26169 EVT BCVT = InVec.getOperand(0).getValueType();
26170 if (!BCVT.isVector() ||
26171 BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
26172 return SDValue();
26173 InVec = InVec.getOperand(0);
26174 }
26175
26176 EVT CurrentVT = InVec.getValueType();
26177
26178 if (!isTargetShuffle(InVec.getOpcode()))
26179 return SDValue();
26180
26181 // Don't duplicate a load with other uses.
26182 if (!InVec.hasOneUse())
26183 return SDValue();
26184
26185 SmallVector<int, 16> ShuffleMask;
26186 SmallVector<SDValue, 2> ShuffleOps;
26187 bool UnaryShuffle;
26188 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
26189 ShuffleOps, ShuffleMask, UnaryShuffle))
26190 return SDValue();
26191
26192 // Select the input vector, guarding against an out-of-range extract index.
26193 unsigned NumElems = CurrentVT.getVectorNumElements();
26194 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
26195 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
26196
26197 if (Idx == SM_SentinelZero)
26198 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
26199 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
26200 if (Idx == SM_SentinelUndef)
26201 return DAG.getUNDEF(EltVT);
26202
26203 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
26204 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
26205 : ShuffleOps[1];
26206
26207 // If inputs to shuffle are the same for both ops, then allow 2 uses
26208 unsigned AllowedUses =
26209 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
26210
26211 if (LdNode.getOpcode() == ISD::BITCAST) {
26212 // Don't duplicate a load with other uses.
26213 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
26214 return SDValue();
26215
26216 AllowedUses = 1; // only allow 1 load use if we have a bitcast
26217 LdNode = LdNode.getOperand(0);
26218 }
26219
26220 if (!ISD::isNormalLoad(LdNode.getNode()))
26221 return SDValue();
26222
26223 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
26224
26225 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
26226 return SDValue();
26227
26228 // If there's a bitcast before the shuffle, check if the load type and
26229 // alignment are valid.
26230 unsigned Align = LN0->getAlignment();
26231 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26232 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
26233 EltVT.getTypeForEVT(*DAG.getContext()));
26234
26235 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
26236 return SDValue();
26237
26238 // All checks match so transform back to vector_shuffle so that DAG combiner
26239 // can finish the job
26240 SDLoc dl(N);
26241
26242 // Create shuffle node taking into account the case that it's a unary shuffle.
26243 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
26244 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
26245 ShuffleMask);
26246 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
26247 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
26248 EltNo);
26249 }
26250
26251 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
26252 const X86Subtarget &Subtarget) {
26253 SDValue N0 = N->getOperand(0);
26254 EVT VT = N->getValueType(0);
26255
26256 // Detect a bitcast of an i32 into the low word of an x86mmx value. Since
26257 // MMX types are special and don't usually play with other vector types,
26258 // it's better to handle them early to be sure we emit efficient code by
26259 // avoiding store-load conversions.
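// Illustrative example (not from the original comments):
//   (x86mmx (bitcast (v2i32 build_vector (i32 X, 0))))
// becomes (X86ISD::MMX_MOVW2D X).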
26260 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
26261 N0.getValueType() == MVT::v2i32 &&
26262 isNullConstant(N0.getOperand(1))) {
26263 SDValue N00 = N0->getOperand(0);
26264 if (N00.getValueType() == MVT::i32)
26265 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
26266 }
26267
26268 // Convert a bitcasted integer logic operation that has one bitcasted
26269 // floating-point operand and one constant operand into a floating-point
26270 // logic operation. This may create a load of the constant, but that is
26271 // cheaper than materializing the constant in an integer register and
26272 // transferring it to an SSE register or transferring the SSE operand to
26273 // integer register and back.
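// Illustrative example (not from the original comments), for f32 with SSE1:
//   (f32 (bitcast (and (i32 (bitcast X)), C)))
// becomes (X86ISD::FAND X, (f32 (bitcast C))).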
26274 unsigned FPOpcode;
26275 switch (N0.getOpcode()) {
26276 case ISD::AND: FPOpcode = X86ISD::FAND; break;
26277 case ISD::OR: FPOpcode = X86ISD::FOR; break;
26278 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
26279 default: return SDValue();
26280 }
26281 if (((Subtarget.hasSSE1() && VT == MVT::f32) ||
26282 (Subtarget.hasSSE2() && VT == MVT::f64)) &&
26283 isa<ConstantSDNode>(N0.getOperand(1)) &&
26284 N0.getOperand(0).getOpcode() == ISD::BITCAST &&
26285 N0.getOperand(0).getOperand(0).getValueType() == VT) {
26286 SDValue N000 = N0.getOperand(0).getOperand(0);
26287 SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1));
26288 return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst);
26289 }
26290
26291 return SDValue();
26292 }
26293
26294 /// Detect vector gather/scatter index generation and convert it from being a
26295 /// bunch of shuffles and extracts into a somewhat faster sequence.
26296 /// For i686, the best sequence is apparently storing the value and loading
26297 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
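///
/// Illustrative sketch (not from the original comments) of the x86-64 path:
/// a v4i32 source is bitcast to v2i64, each i64 half is extracted, and the
/// four i32 results are produced with truncates and a 32-bit arithmetic
/// right shift.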
26298 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
26299 TargetLowering::DAGCombinerInfo &DCI) {
26300 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
26301 return NewOp;
26302
26303 SDValue InputVector = N->getOperand(0);
26304 SDLoc dl(InputVector);
26305 // Detect mmx to i32 conversion through a v2i32 elt extract.
26306 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
26307 N->getValueType(0) == MVT::i32 &&
26308 InputVector.getValueType() == MVT::v2i32 &&
26309 isa<ConstantSDNode>(N->getOperand(1)) &&
26310 N->getConstantOperandVal(1) == 0) {
26311 SDValue MMXSrc = InputVector.getNode()->getOperand(0);
26312
26313 // The bitcast source is a direct mmx result.
26314 if (MMXSrc.getValueType() == MVT::x86mmx)
26315 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
26316 }
26317
26318 EVT VT = N->getValueType(0);
26319
26320 if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
26321 InputVector.getOpcode() == ISD::BITCAST &&
26322 isa<ConstantSDNode>(InputVector.getOperand(0))) {
26323 uint64_t ExtractedElt =
26324 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
26325 uint64_t InputValue =
26326 cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
26327 uint64_t Res = (InputValue >> ExtractedElt) & 1;
26328 return DAG.getConstant(Res, dl, MVT::i1);
26329 }
26330 // Only operate on vectors of 4 elements, where the alternative shuffling
26331 // gets to be more expensive.
26332 if (InputVector.getValueType() != MVT::v4i32)
26333 return SDValue();
26334
26335 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
26336 // single use which is a sign-extend or zero-extend, and all elements are
26337 // used.
26338 SmallVector<SDNode *, 4> Uses;
26339 unsigned ExtractedElements = 0;
26340 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
26341 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
26342 if (UI.getUse().getResNo() != InputVector.getResNo())
26343 return SDValue();
26344
26345 SDNode *Extract = *UI;
26346 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
26347 return SDValue();
26348
26349 if (Extract->getValueType(0) != MVT::i32)
26350 return SDValue();
26351 if (!Extract->hasOneUse())
26352 return SDValue();
26353 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
26354 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
26355 return SDValue();
26356 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
26357 return SDValue();
26358
26359 // Record which element was extracted.
26360 ExtractedElements |=
26361 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
26362
26363 Uses.push_back(Extract);
26364 }
26365
26366 // If not all the elements were used, this may not be worthwhile.
26367 if (ExtractedElements != 15)
26368 return SDValue();
26369
26370 // Ok, we've now decided to do the transformation.
26371 // If 64-bit shifts are legal, use the extract-shift sequence,
26372 // otherwise bounce the vector off the cache.
26373 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26374 SDValue Vals[4];
26375
26376 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
26377 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
26378 auto &DL = DAG.getDataLayout();
26379 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
26380 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
26381 DAG.getConstant(0, dl, VecIdxTy));
26382 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
26383 DAG.getConstant(1, dl, VecIdxTy));
26384
26385 SDValue ShAmt = DAG.getConstant(
26386 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
26387 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
26388 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
26389 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
26390 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
26391 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
26392 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
26393 } else {
26394 // Store the value to a temporary stack slot.
26395 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
26396 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
26397 MachinePointerInfo(), false, false, 0);
26398
26399 EVT ElementType = InputVector.getValueType().getVectorElementType();
26400 unsigned EltSize = ElementType.getSizeInBits() / 8;
26401
26402 // Replace each use (extract) with a load of the appropriate element.
26403 for (unsigned i = 0; i < 4; ++i) {
26404 uint64_t Offset = EltSize * i;
26405 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26406 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
26407
26408 SDValue ScalarAddr =
26409 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
26410
26411 // Load the scalar.
26412 Vals[i] = DAG.getLoad(ElementType, dl, Ch,
26413 ScalarAddr, MachinePointerInfo(),
26414 false, false, false, 0);
26415
26416 }
26417 }
26418
26419 // Replace the extracts
26420 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
26421 UE = Uses.end(); UI != UE; ++UI) {
26422 SDNode *Extract = *UI;
26423
26424 SDValue Idx = Extract->getOperand(1);
26425 uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
26426 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
26427 }
26428
26429 // The replacement was made in place; don't return anything.
26430 return SDValue();
26431 }
26432
26433 /// Do target-specific dag combines on SELECT and VSELECT nodes.
26434 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
26435 TargetLowering::DAGCombinerInfo &DCI,
26436 const X86Subtarget &Subtarget) {
26437 SDLoc DL(N);
26438 SDValue Cond = N->getOperand(0);
26439 // Get the LHS/RHS of the select.
26440 SDValue LHS = N->getOperand(1);
26441 SDValue RHS = N->getOperand(2);
26442 EVT VT = LHS.getValueType();
26443 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26444
26445 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
26446 // instructions match the semantics of the common C idiom x<y?x:y but not
26447 // x<=y?x:y, because of how they handle negative zero (which can be
26448 // ignored in unsafe-math mode).
26449 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
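// Illustrative example (not from the original comments):
//   (select (setcc x, y, setolt), x, y) --> (X86ISD::FMIN x, y)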
26450 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
26451 VT != MVT::f80 && VT != MVT::f128 &&
26452 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
26453 (Subtarget.hasSSE2() ||
26454 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
26455 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26456
26457 unsigned Opcode = 0;
26458 // Check for x CC y ? x : y.
26459 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
26460 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
26461 switch (CC) {
26462 default: break;
26463 case ISD::SETULT:
26464 // Converting this to a min would handle NaNs incorrectly, and swapping
26465 // the operands would cause it to handle comparisons between positive
26466 // and negative zero incorrectly.
26467 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
26468 if (!DAG.getTarget().Options.UnsafeFPMath &&
26469 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
26470 break;
26471 std::swap(LHS, RHS);
26472 }
26473 Opcode = X86ISD::FMIN;
26474 break;
26475 case ISD::SETOLE:
26476 // Converting this to a min would handle comparisons between positive
26477 // and negative zero incorrectly.
26478 if (!DAG.getTarget().Options.UnsafeFPMath &&
26479 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
26480 break;
26481 Opcode = X86ISD::FMIN;
26482 break;
26483 case ISD::SETULE:
26484 // Converting this to a min would handle both negative zeros and NaNs
26485 // incorrectly, but we can swap the operands to fix both.
26486 std::swap(LHS, RHS);
26487 case ISD::SETOLT:
26488 case ISD::SETLT:
26489 case ISD::SETLE:
26490 Opcode = X86ISD::FMIN;
26491 break;
26492
26493 case ISD::SETOGE:
26494 // Converting this to a max would handle comparisons between positive
26495 // and negative zero incorrectly.
26496 if (!DAG.getTarget().Options.UnsafeFPMath &&
26497 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
26498 break;
26499 Opcode = X86ISD::FMAX;
26500 break;
26501 case ISD::SETUGT:
26502 // Converting this to a max would handle NaNs incorrectly, and swapping
26503 // the operands would cause it to handle comparisons between positive
26504 // and negative zero incorrectly.
26505 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
26506 if (!DAG.getTarget().Options.UnsafeFPMath &&
26507 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
26508 break;
26509 std::swap(LHS, RHS);
26510 }
26511 Opcode = X86ISD::FMAX;
26512 break;
26513 case ISD::SETUGE:
26514 // Converting this to a max would handle both negative zeros and NaNs
26515 // incorrectly, but we can swap the operands to fix both.
26516 std::swap(LHS, RHS);
26517 case ISD::SETOGT:
26518 case ISD::SETGT:
26519 case ISD::SETGE:
26520 Opcode = X86ISD::FMAX;
26521 break;
26522 }
26523 // Check for x CC y ? y : x -- a min/max with reversed arms.
26524 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
26525 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
26526 switch (CC) {
26527 default: break;
26528 case ISD::SETOGE:
26529 // Converting this to a min would handle comparisons between positive
26530 // and negative zero incorrectly, and swapping the operands would
26531 // cause it to handle NaNs incorrectly.
26532 if (!DAG.getTarget().Options.UnsafeFPMath &&
26533 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
26534 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
26535 break;
26536 std::swap(LHS, RHS);
26537 }
26538 Opcode = X86ISD::FMIN;
26539 break;
26540 case ISD::SETUGT:
26541 // Converting this to a min would handle NaNs incorrectly.
26542 if (!DAG.getTarget().Options.UnsafeFPMath &&
26543 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
26544 break;
26545 Opcode = X86ISD::FMIN;
26546 break;
26547 case ISD::SETUGE:
26548 // Converting this to a min would handle both negative zeros and NaNs
26549 // incorrectly, but we can swap the operands to fix both.
26550 std::swap(LHS, RHS);
26551 case ISD::SETOGT:
26552 case ISD::SETGT:
26553 case ISD::SETGE:
26554 Opcode = X86ISD::FMIN;
26555 break;
26556
26557 case ISD::SETULT:
26558 // Converting this to a max would handle NaNs incorrectly.
26559 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
26560 break;
26561 Opcode = X86ISD::FMAX;
26562 break;
26563 case ISD::SETOLE:
26564 // Converting this to a max would handle comparisons between positive
26565 // and negative zero incorrectly, and swapping the operands would
26566 // cause it to handle NaNs incorrectly.
26567 if (!DAG.getTarget().Options.UnsafeFPMath &&
26568 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
26569 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
26570 break;
26571 std::swap(LHS, RHS);
26572 }
26573 Opcode = X86ISD::FMAX;
26574 break;
26575 case ISD::SETULE:
26576 // Converting this to a max would handle both negative zeros and NaNs
26577 // incorrectly, but we can swap the operands to fix both.
26578 std::swap(LHS, RHS);
26579 case ISD::SETOLT:
26580 case ISD::SETLT:
26581 case ISD::SETLE:
26582 Opcode = X86ISD::FMAX;
26583 break;
26584 }
26585 }
26586
26587 if (Opcode)
26588 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
26589 }
26590
26591 EVT CondVT = Cond.getValueType();
26592 if (Subtarget.hasAVX512() && VT.isVector() && CondVT.isVector() &&
26593 CondVT.getVectorElementType() == MVT::i1) {
26594 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
26595 // lowering on KNL. In this case we convert it to
26596 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
26597 // The same applies to all 128-bit and 256-bit vectors of i8 and i16.
26598 // Since SKX these selects have a proper lowering.
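// Illustrative example (not from the original comments):
//   (v16i8 (vselect (v16i1 M), A, B))
// --> (v16i8 (vselect (v16i8 (sign_extend M)), A, B))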
26599 EVT OpVT = LHS.getValueType();
26600 if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
26601 (OpVT.getVectorElementType() == MVT::i8 ||
26602 OpVT.getVectorElementType() == MVT::i16) &&
26603 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
26604 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
26605 DCI.AddToWorklist(Cond.getNode());
26606 return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
26607 }
26608 }
26609 // If this is a select between two integer constants, try to do some
26610 // optimizations.
26611 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
26612 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
26613 // Don't do this for crazy integer types.
26614 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
26615 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
26616 // so that TrueC (the true value) is larger than FalseC.
26617 bool NeedsCondInvert = false;
26618
26619 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
26620 // Efficiently invertible.
26621 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
26622 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
26623 isa<ConstantSDNode>(Cond.getOperand(1))))) {
26624 NeedsCondInvert = true;
26625 std::swap(TrueC, FalseC);
26626 }
26627
26628 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
26629 if (FalseC->getAPIntValue() == 0 &&
26630 TrueC->getAPIntValue().isPowerOf2()) {
26631 if (NeedsCondInvert) // Invert the condition if needed.
26632 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
26633 DAG.getConstant(1, DL, Cond.getValueType()));
26634
26635 // Zero extend the condition if needed.
26636 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
26637
26638 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
26639 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
26640 DAG.getConstant(ShAmt, DL, MVT::i8));
26641 }
26642
26643 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
26644 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
26645 if (NeedsCondInvert) // Invert the condition if needed.
26646 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
26647 DAG.getConstant(1, DL, Cond.getValueType()));
26648
26649 // Zero extend the condition if needed.
26650 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
26651 FalseC->getValueType(0), Cond);
26652 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
26653 SDValue(FalseC, 0));
26654 }
26655
26656 // Optimize cases that will turn into an LEA instruction. This requires
26657 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
26658 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
26659 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
26660 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
26661
26662 bool isFastMultiplier = false;
26663 if (Diff < 10) {
26664 switch ((unsigned char)Diff) {
26665 default: break;
26666 case 1: // result = add base, cond
26667 case 2: // result = lea base( , cond*2)
26668 case 3: // result = lea base(cond, cond*2)
26669 case 4: // result = lea base( , cond*4)
26670 case 5: // result = lea base(cond, cond*4)
26671 case 8: // result = lea base( , cond*8)
26672 case 9: // result = lea base(cond, cond*8)
26673 isFastMultiplier = true;
26674 break;
26675 }
26676 }
26677
26678 if (isFastMultiplier) {
26679 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
26680 if (NeedsCondInvert) // Invert the condition if needed.
26681 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
26682 DAG.getConstant(1, DL, Cond.getValueType()));
26683
26684 // Zero extend the condition if needed.
26685 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
26686 Cond);
26687 // Scale the condition by the difference.
26688 if (Diff != 1)
26689 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
26690 DAG.getConstant(Diff, DL,
26691 Cond.getValueType()));
26692
26693 // Add the base if non-zero.
26694 if (FalseC->getAPIntValue() != 0)
26695 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
26696 SDValue(FalseC, 0));
26697 return Cond;
26698 }
26699 }
26700 }
26701 }
26702
26703 // Canonicalize max and min:
26704 // (x > y) ? x : y -> (x >= y) ? x : y
26705 // (x < y) ? x : y -> (x <= y) ? x : y
26706 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
26707 // the need for an extra compare
26708 // against zero. e.g.
26709 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
26710 // subl %esi, %edi
26711 // testl %edi, %edi
26712 // movl $0, %eax
26713 // cmovgl %edi, %eax
26714 // =>
26715 // xorl %eax, %eax
26716 // subl %esi, %edi
26717 // cmovsl %eax, %edi
26718 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
26719 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
26720 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
26721 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26722 switch (CC) {
26723 default: break;
26724 case ISD::SETLT:
26725 case ISD::SETGT: {
26726 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
26727 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
26728 Cond.getOperand(0), Cond.getOperand(1), NewCC);
26729 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
26730 }
26731 }
26732 }
26733
26734 // Early exit check
26735 if (!TLI.isTypeLegal(VT))
26736 return SDValue();
26737
26738 // Match VSELECTs into subs with unsigned saturation.
26739 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
26740 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
26741 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
26742 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
26743 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26744
26745 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
26746 // left side invert the predicate to simplify logic below.
26747 SDValue Other;
26748 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
26749 Other = RHS;
26750 CC = ISD::getSetCCInverse(CC, true);
26751 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
26752 Other = LHS;
26753 }
26754
26755 if (Other.getNode() && Other->getNumOperands() == 2 &&
26756 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
26757 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
26758 SDValue CondRHS = Cond->getOperand(1);
26759
26760 // Look for a general sub with unsigned saturation first.
26761 // x >= y ? x-y : 0 --> subus x, y
26762 // x > y ? x-y : 0 --> subus x, y
26763 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
26764 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
26765 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
26766
26767 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
26768 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
26769 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
26770 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
26771 // If the RHS is a constant we have to reverse the const
26772 // canonicalization.
26773 // x > C-1 ? x+-C : 0 --> subus x, C
26774 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
26775 CondRHSConst->getAPIntValue() ==
26776 (-OpRHSConst->getAPIntValue() - 1))
26777 return DAG.getNode(
26778 X86ISD::SUBUS, DL, VT, OpLHS,
26779 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
26780
26781 // Another special case: If C was a sign bit, the sub has been
26782 // canonicalized into a xor.
26783 // FIXME: Would it be better to use computeKnownBits to determine
26784 // whether it's safe to decanonicalize the xor?
26785 // x s< 0 ? x^C : 0 --> subus x, C
26786 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
26787 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
26788 OpRHSConst->getAPIntValue().isSignBit())
26789 // Note that we have to rebuild the RHS constant here to ensure we
26790 // don't rely on particular values of undef lanes.
26791 return DAG.getNode(
26792 X86ISD::SUBUS, DL, VT, OpLHS,
26793 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
26794 }
26795 }
26796 }
26797
26798 // Simplify vector selection if condition value type matches vselect
26799 // operand type
26800 if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
26801 assert(Cond.getValueType().isVector() &&
26802 "vector select expects a vector selector!");
26803
26804 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
26805 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
26806
26807 // Try to invert the condition if the true value is not all 1s and the
26808 // false value is not all 0s.
26809 if (!TValIsAllOnes && !FValIsAllZeros &&
26810 // Check if the selector will be produced by CMPP*/PCMP*
26811 Cond.getOpcode() == ISD::SETCC &&
26812 // Check if SETCC has already been promoted
26813 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
26814 CondVT) {
26815 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
26816 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
26817
26818 if (TValIsAllZeros || FValIsAllOnes) {
26819 SDValue CC = Cond.getOperand(2);
26820 ISD::CondCode NewCC =
26821 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
26822 Cond.getOperand(0).getValueType().isInteger());
26823 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
26824 std::swap(LHS, RHS);
26825 TValIsAllOnes = FValIsAllOnes;
26826 FValIsAllZeros = TValIsAllZeros;
26827 }
26828 }
26829
26830 if (TValIsAllOnes || FValIsAllZeros) {
26831 SDValue Ret;
26832
26833 if (TValIsAllOnes && FValIsAllZeros)
26834 Ret = Cond;
26835 else if (TValIsAllOnes)
26836 Ret =
26837 DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
26838 else if (FValIsAllZeros)
26839 Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
26840 DAG.getBitcast(CondVT, LHS));
26841
26842 return DAG.getBitcast(VT, Ret);
26843 }
26844 }
26845
26846 // If this is a *dynamic* select (non-constant condition) and we can match
26847 // this node with one of the variable blend instructions, restructure the
26848 // condition so that the blends can use the high bit of each element and use
26849 // SimplifyDemandedBits to simplify the condition operand.
26850 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
26851 !DCI.isBeforeLegalize() &&
26852 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
26853 unsigned BitWidth = Cond.getValueType().getScalarSizeInBits();
26854
26855 // Don't optimize vector selects that map to mask-registers.
26856 if (BitWidth == 1)
26857 return SDValue();
26858
26859 // We can only handle the cases where VSELECT is directly legal on the
26860 // subtarget. We custom lower VSELECT nodes with constant conditions and
26861 // this makes it hard to see whether a dynamic VSELECT will correctly
26862 // lower, so we both check the operation's status and explicitly handle the
26863 // cases where a *dynamic* blend will fail even though a constant-condition
26864 // blend could be custom lowered.
26865 // FIXME: We should find a better way to handle this class of problems.
26866 // Potentially, we should combine constant-condition vselect nodes
26867 // pre-legalization into shuffles and not mark as many types as custom
26868 // lowered.
26869 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
26870 return SDValue();
26871 // FIXME: We don't support i16-element blends currently. We could and
26872 // should support them by making *all* the bits in the condition be set
26873 // rather than just the high bit and using an i8-element blend.
26874 if (VT.getVectorElementType() == MVT::i16)
26875 return SDValue();
26876 // Dynamic blending was only available from SSE4.1 onward.
26877 if (VT.is128BitVector() && !Subtarget.hasSSE41())
26878 return SDValue();
26879 // Byte blends are only available in AVX2
26880 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
26881 return SDValue();
26882
26883 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
26884 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
26885
26886 APInt KnownZero, KnownOne;
26887 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
26888 DCI.isBeforeLegalizeOps());
26889 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
26890 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
26891 TLO)) {
26892 // If we changed the computation somewhere in the DAG, this change
26893 // will affect all users of Cond.
26894 // Make sure it is fine and update all the nodes so that we do not
26895 // use the generic VSELECT anymore. Otherwise, we may perform
26896 // wrong optimizations as we messed up with the actual expectation
26897 // for the vector boolean values.
26898 if (Cond != TLO.Old) {
26899 // Check all uses of the condition operand to see whether it will be
26900 // consumed by non-BLEND instructions. Those may depend on all bits
26901 // being set properly.
26902 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
26903 I != E; ++I)
26904 if (I->getOpcode() != ISD::VSELECT)
26905 // TODO: Add other opcodes eventually lowered into BLEND.
26906 return SDValue();
26907
26908 // Update all the users of the condition, before committing the change,
26909 // so that the VSELECT optimizations that expect the correct vector
26910 // boolean value will not be triggered.
26911 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
26912 I != E; ++I)
26913 DAG.ReplaceAllUsesOfValueWith(
26914 SDValue(*I, 0),
26915 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
26916 Cond, I->getOperand(1), I->getOperand(2)));
26917 DCI.CommitTargetLoweringOpt(TLO);
26918 return SDValue();
26919 }
26920 // At this point, only Cond is changed. Change the condition just for N
26921 // so that all other users keep the opportunity to be optimized in their
26922 // own way.
26923 DAG.ReplaceAllUsesOfValueWith(
26924 SDValue(N, 0),
26925 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
26926 TLO.New, N->getOperand(1), N->getOperand(2)));
26927 return SDValue();
26928 }
26929 }
26930
26931 return SDValue();
26932 }
26933
26934 /// Combine:
26935 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
26936 /// to:
26937 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
26938 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
26939 /// Note that this is only legal for some op/cc combinations.
26940 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
26941 SelectionDAG &DAG) {
26942 // This combine only operates on CMP-like nodes.
26943 if (!(Cmp.getOpcode() == X86ISD::CMP ||
26944 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
26945 return SDValue();
26946
26947 // This only applies to variations of the common case:
26948 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
26949 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
26950 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
26951 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
26952 // Using the proper condcodes (see below), overflow is checked for.
26953
26954 // FIXME: We can generalize both constraints:
26955 // - XOR/OR/AND (if they were made to survive AtomicExpand)
26956 // - LHS != 1
26957 // if the result is compared.
26958
26959 SDValue CmpLHS = Cmp.getOperand(0);
26960 SDValue CmpRHS = Cmp.getOperand(1);
26961
26962 if (!CmpLHS.hasOneUse())
26963 return SDValue();
26964
26965 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
26966 if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
26967 return SDValue();
26968
26969 const unsigned Opc = CmpLHS.getOpcode();
26970
26971 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
26972 return SDValue();
26973
26974 SDValue OpRHS = CmpLHS.getOperand(2);
26975 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
26976 if (!OpRHSC)
26977 return SDValue();
26978
26979 APInt Addend = OpRHSC->getAPIntValue();
26980 if (Opc == ISD::ATOMIC_LOAD_SUB)
26981 Addend = -Addend;
26982
26983 if (CC == X86::COND_S && Addend == 1)
26984 CC = X86::COND_LE;
26985 else if (CC == X86::COND_NS && Addend == 1)
26986 CC = X86::COND_G;
26987 else if (CC == X86::COND_G && Addend == -1)
26988 CC = X86::COND_GE;
26989 else if (CC == X86::COND_LE && Addend == -1)
26990 CC = X86::COND_L;
26991 else
26992 return SDValue();
26993
26994 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
26995 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
26996 DAG.getUNDEF(CmpLHS.getValueType()));
26997 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
26998 return LockOp;
26999 }
27000
27001 // Check whether a boolean test is testing a boolean value generated by
27002 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
27003 // code.
27004 //
27005 // Simplify the following patterns:
27006 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
27007 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
27008 // to (Op EFLAGS Cond)
27009 //
27010 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
27011 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
27012 // to (Op EFLAGS !Cond)
27013 //
27014 // where Op could be BRCOND or CMOV.
27015 //
27016 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
27017 // This combine only operates on CMP-like nodes.
27018 if (!(Cmp.getOpcode() == X86ISD::CMP ||
27019 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
27020 return SDValue();
27021
27022 // Quit if not used as a boolean value.
27023 if (CC != X86::COND_E && CC != X86::COND_NE)
27024 return SDValue();
27025
27026 // Check CMP operands. One of them should be 0 or 1 and the other should
27027 // be a SetCC node or a value extended from it.
27028 SDValue Op1 = Cmp.getOperand(0);
27029 SDValue Op2 = Cmp.getOperand(1);
27030
27031 SDValue SetCC;
27032 const ConstantSDNode* C = nullptr;
27033 bool needOppositeCond = (CC == X86::COND_E);
27034 bool checkAgainstTrue = false; // Is it a comparison against 1?
27035
27036 if ((C = dyn_cast<ConstantSDNode>(Op1)))
27037 SetCC = Op2;
27038 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
27039 SetCC = Op1;
27040 else // Quit if neither operand is a constant.
27041 return SDValue();
27042
27043 if (C->getZExtValue() == 1) {
27044 needOppositeCond = !needOppositeCond;
27045 checkAgainstTrue = true;
27046 } else if (C->getZExtValue() != 0)
27047 // Quit if the constant is neither 0 nor 1.
27048 return SDValue();
27049
27050 bool truncatedToBoolWithAnd = false;
27051 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
27052 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
27053 SetCC.getOpcode() == ISD::TRUNCATE ||
27054 SetCC.getOpcode() == ISD::AssertZext ||
27055 SetCC.getOpcode() == ISD::AND) {
27056 if (SetCC.getOpcode() == ISD::AND) {
27057 int OpIdx = -1;
27058 if (isOneConstant(SetCC.getOperand(0)))
27059 OpIdx = 1;
27060 if (isOneConstant(SetCC.getOperand(1)))
27061 OpIdx = 0;
27062 if (OpIdx < 0)
27063 break;
27064 SetCC = SetCC.getOperand(OpIdx);
27065 truncatedToBoolWithAnd = true;
27066 } else
27067 SetCC = SetCC.getOperand(0);
27068 }
27069
27070 switch (SetCC.getOpcode()) {
27071 case X86ISD::SETCC_CARRY:
27072 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
27073 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
27074 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
27075 // truncated to i1 using 'and'.
27076 if (checkAgainstTrue && !truncatedToBoolWithAnd)
27077 break;
27078 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
27079 "Invalid use of SETCC_CARRY!");
27080 // FALL THROUGH
27081 case X86ISD::SETCC:
27082 // Set the condition code or opposite one if necessary.
27083 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
27084 if (needOppositeCond)
27085 CC = X86::GetOppositeBranchCondition(CC);
27086 return SetCC.getOperand(1);
27087 case X86ISD::CMOV: {
27088 // Check whether the false/true values are canonical, i.e. 0 or 1.
27089 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
27090 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
27091 // Quit if true value is not a constant.
27092 if (!TVal)
27093 return SDValue();
27094 // Quit if false value is not a constant.
27095 if (!FVal) {
27096 SDValue Op = SetCC.getOperand(0);
27097 // Skip 'zext' or 'trunc' node.
27098 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
27099 Op.getOpcode() == ISD::TRUNCATE)
27100 Op = Op.getOperand(0);
27101 // A special case for rdrand/rdseed, where 0 is set if false cond is
27102 // found.
27103 if ((Op.getOpcode() != X86ISD::RDRAND &&
27104 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
27105 return SDValue();
27106 }
27107 // Quit if false value is not the constant 0 or 1.
27108 bool FValIsFalse = true;
27109 if (FVal && FVal->getZExtValue() != 0) {
27110 if (FVal->getZExtValue() != 1)
27111 return SDValue();
27112 // If FVal is 1, opposite cond is needed.
27113 needOppositeCond = !needOppositeCond;
27114 FValIsFalse = false;
27115 }
27116 // Quit if TVal is not the constant opposite of FVal.
27117 if (FValIsFalse && TVal->getZExtValue() != 1)
27118 return SDValue();
27119 if (!FValIsFalse && TVal->getZExtValue() != 0)
27120 return SDValue();
27121 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
27122 if (needOppositeCond)
27123 CC = X86::GetOppositeBranchCondition(CC);
27124 return SetCC.getOperand(3);
27125 }
27126 }
27127
27128 return SDValue();
27129 }
27130
27131 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
27132 /// Match:
27133 /// (X86or (X86setcc) (X86setcc))
27134 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
27135 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
27136 X86::CondCode &CC1, SDValue &Flags,
27137 bool &isAnd) {
27138 if (Cond->getOpcode() == X86ISD::CMP) {
27139 if (!isNullConstant(Cond->getOperand(1)))
27140 return false;
27141
27142 Cond = Cond->getOperand(0);
27143 }
27144
27145 isAnd = false;
27146
27147 SDValue SetCC0, SetCC1;
27148 switch (Cond->getOpcode()) {
27149 default: return false;
27150 case ISD::AND:
27151 case X86ISD::AND:
27152 isAnd = true;
27153 // fallthru
27154 case ISD::OR:
27155 case X86ISD::OR:
27156 SetCC0 = Cond->getOperand(0);
27157 SetCC1 = Cond->getOperand(1);
27158 break;
27159 }
27160
27161 // Make sure we have SETCC nodes, using the same flags value.
27162 if (SetCC0.getOpcode() != X86ISD::SETCC ||
27163 SetCC1.getOpcode() != X86ISD::SETCC ||
27164 SetCC0->getOperand(1) != SetCC1->getOperand(1))
27165 return false;
27166
27167 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
27168 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
27169 Flags = SetCC0->getOperand(1);
27170 return true;
27171 }
27172
27173 /// Optimize an EFLAGS definition used according to the condition code \p CC
27174 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
27175 /// uses of chain values.
27176 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
27177 SelectionDAG &DAG) {
27178 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
27179 return R;
27180 return combineSetCCAtomicArith(EFLAGS, CC, DAG);
27181 }
27182
27183 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
27184 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
27185 TargetLowering::DAGCombinerInfo &DCI,
27186 const X86Subtarget &Subtarget) {
27187 SDLoc DL(N);
27188
27189 // If the flag operand isn't dead, don't touch this CMOV.
27190 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
27191 return SDValue();
27192
27193 SDValue FalseOp = N->getOperand(0);
27194 SDValue TrueOp = N->getOperand(1);
27195 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
27196 SDValue Cond = N->getOperand(3);
27197
27198 if (CC == X86::COND_E || CC == X86::COND_NE) {
27199 switch (Cond.getOpcode()) {
27200 default: break;
27201 case X86ISD::BSR:
27202 case X86ISD::BSF:
27203 // If the operand of BSR/BSF is proven never zero, then ZF cannot be set.
27204 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
27205 return (CC == X86::COND_E) ? FalseOp : TrueOp;
27206 }
27207 }
27208
27209 // Try to simplify the EFLAGS and condition code operands.
27210 // We can't always do this as FCMOV only supports a subset of X86 conditions.
27211 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
27212 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
27213 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
27214 Flags};
27215 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
27216 }
27217 }
27218
27219 // If this is a select between two integer constants, try to do some
27220 // optimizations. Note that the operands are ordered the opposite of SELECT
27221 // operands.
27222 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
27223 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
27224 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
27225 // larger than FalseC (the false value).
27226 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
27227 CC = X86::GetOppositeBranchCondition(CC);
27228 std::swap(TrueC, FalseC);
27229 std::swap(TrueOp, FalseOp);
27230 }
27231
27232 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
27233 // This is efficient for any integer data type (including i8/i16) and
27234 // shift amount.
27235 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
27236 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
27237 DAG.getConstant(CC, DL, MVT::i8), Cond);
27238
27239 // Zero extend the condition if needed.
27240 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
27241
27242 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
27243 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
27244 DAG.getConstant(ShAmt, DL, MVT::i8));
27245 if (N->getNumValues() == 2) // Dead flag value?
27246 return DCI.CombineTo(N, Cond, SDValue());
27247 return Cond;
27248 }
27249
27250 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
27251 // for any integer data type, including i8/i16.
27252 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
27253 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
27254 DAG.getConstant(CC, DL, MVT::i8), Cond);
27255
27256 // Zero extend the condition if needed.
27257 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
27258 FalseC->getValueType(0), Cond);
27259 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
27260 SDValue(FalseC, 0));
27261
27262 if (N->getNumValues() == 2) // Dead flag value?
27263 return DCI.CombineTo(N, Cond, SDValue());
27264 return Cond;
27265 }
27266
27267 // Optimize cases that will turn into an LEA instruction. This requires
27268 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
27269 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
27270 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
27271 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
27272
27273 bool isFastMultiplier = false;
27274 if (Diff < 10) {
27275 switch ((unsigned char)Diff) {
27276 default: break;
27277 case 1: // result = add base, cond
27278 case 2: // result = lea base( , cond*2)
27279 case 3: // result = lea base(cond, cond*2)
27280 case 4: // result = lea base( , cond*4)
27281 case 5: // result = lea base(cond, cond*4)
27282 case 8: // result = lea base( , cond*8)
27283 case 9: // result = lea base(cond, cond*8)
27284 isFastMultiplier = true;
27285 break;
27286 }
27287 }
27288
27289 if (isFastMultiplier) {
27290 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
27291 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
27292 DAG.getConstant(CC, DL, MVT::i8), Cond);
27293 // Zero extend the condition if needed.
27294 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
27295 Cond);
27296 // Scale the condition by the difference.
27297 if (Diff != 1)
27298 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
27299 DAG.getConstant(Diff, DL, Cond.getValueType()));
27300
27301 // Add the base if non-zero.
27302 if (FalseC->getAPIntValue() != 0)
27303 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
27304 SDValue(FalseC, 0));
27305 if (N->getNumValues() == 2) // Dead flag value?
27306 return DCI.CombineTo(N, Cond, SDValue());
27307 return Cond;
27308 }
27309 }
27310 }
27311 }
27312
27313 // Handle these cases:
27314 // (select (x != c), e, c) -> (select (x != c), e, x),
27315 // (select (x == c), c, e) -> (select (x == c), x, e)
27316 // where c is an integer constant, and the "select" is the combination
27317 // of CMOV and CMP.
27318 //
27319 // The rationale for this change is that the conditional-move from a constant
27320 // needs two instructions, whereas a conditional-move from a register needs
27321 // only one instruction.
27322 //
27323 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
27324 // some instruction-combining opportunities. This opt needs to be
27325 // postponed as late as possible.
27326 //
27327 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
27328 // the DCI.xxxx conditions are provided to postpone the optimization as
27329 // late as possible.
27330
27331 ConstantSDNode *CmpAgainst = nullptr;
27332 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
27333 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
27334 !isa<ConstantSDNode>(Cond.getOperand(0))) {
27335
27336 if (CC == X86::COND_NE &&
27337 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
27338 CC = X86::GetOppositeBranchCondition(CC);
27339 std::swap(TrueOp, FalseOp);
27340 }
27341
27342 if (CC == X86::COND_E &&
27343 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
27344 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
27345 DAG.getConstant(CC, DL, MVT::i8), Cond };
27346 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
27347 }
27348 }
27349 }
27350
27351 // Fold and/or of setcc's to double CMOV:
27352 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
27353 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
27354 //
27355 // This combine lets us generate:
27356 // cmovcc1 (jcc1 if we don't have CMOV)
27357 // cmovcc2 (same)
27358 // instead of:
27359 // setcc1
27360 // setcc2
27361 // and/or
27362 // cmovne (jne if we don't have CMOV)
27363 // When we can't use the CMOV instruction, it might increase branch
27364 // mispredicts.
27365 // When we can use CMOV, or when there is no mispredict, this improves
27366 // throughput and reduces register pressure.
27367 //
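// Illustrative example: with cc1 = COND_L and cc2 = COND_E reading the same
// EFLAGS, the OR form above becomes cmovl followed by cmove instead of
// setl + sete + or + cmovne (instruction names are indicative only; the
// exact sequence depends on register allocation and the target).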
27368 if (CC == X86::COND_NE) {
27369 SDValue Flags;
27370 X86::CondCode CC0, CC1;
27371 bool isAndSetCC;
27372 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
27373 if (isAndSetCC) {
27374 std::swap(FalseOp, TrueOp);
27375 CC0 = X86::GetOppositeBranchCondition(CC0);
27376 CC1 = X86::GetOppositeBranchCondition(CC1);
27377 }
27378
27379 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
27380 Flags};
27381 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
27382 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
27383 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
27384 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
27385 return CMOV;
27386 }
27387 }
27388
27389 return SDValue();
27390 }
27391
27392 /// Different mul shrinking modes.
27393 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
27394
27395 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
27396 EVT VT = N->getOperand(0).getValueType();
27397 if (VT.getScalarSizeInBits() != 32)
27398 return false;
27399
27400 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
27401 unsigned SignBits[2] = {1, 1};
27402 bool IsPositive[2] = {false, false};
27403 for (unsigned i = 0; i < 2; i++) {
27404 SDValue Opd = N->getOperand(i);
27405
27406 // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
27407 // compute the sign bits for it separately.
27408 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
27409 // For anyextend, it is safe to assume an appropriate number of leading
27410 // sign/zero bits.
27411 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
27412 SignBits[i] = 25;
27413 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
27414 MVT::i16)
27415 SignBits[i] = 17;
27416 else
27417 return false;
27418 IsPositive[i] = true;
27419 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
27420 // All the operands of BUILD_VECTOR need to be integer constants.
27421 // Find the smallest value range which all the operands belong to.
27422 SignBits[i] = 32;
27423 IsPositive[i] = true;
27424 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
27425 if (SubOp.isUndef())
27426 continue;
27427 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
27428 if (!CN)
27429 return false;
27430 APInt IntVal = CN->getAPIntValue();
27431 if (IntVal.isNegative())
27432 IsPositive[i] = false;
27433 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
27434 }
27435 } else {
27436 SignBits[i] = DAG.ComputeNumSignBits(Opd);
27437 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
27438 IsPositive[i] = true;
27439 }
27440 }
27441
27442 bool AllPositive = IsPositive[0] && IsPositive[1];
27443 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
27444 // When ranges are from -128 ~ 127, use MULS8 mode.
27445 if (MinSignBits >= 25)
27446 Mode = MULS8;
27447 // When ranges are from 0 ~ 255, use MULU8 mode.
27448 else if (AllPositive && MinSignBits >= 24)
27449 Mode = MULU8;
27450 // When ranges are from -32768 ~ 32767, use MULS16 mode.
27451 else if (MinSignBits >= 17)
27452 Mode = MULS16;
27453 // When ranges are from 0 ~ 65535, use MULU16 mode.
27454 else if (AllPositive && MinSignBits >= 16)
27455 Mode = MULU16;
27456 else
27457 return false;
27458 return true;
27459 }
27460
27461 /// When the operands of a vector mul are extended from smaller size values,
27462 /// like i8 and i16, the type of the mul may be shrunk to generate more
27463 /// efficient code. Two typical patterns are handled:
27464 /// Pattern1:
27465 /// %2 = sext/zext <N x i8> %1 to <N x i32>
27466 /// %4 = sext/zext <N x i8> %3 to <N x i32>
27467 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
27468 /// %5 = mul <N x i32> %2, %4
27469 ///
27470 /// Pattern2:
27471 /// %2 = zext/sext <N x i16> %1 to <N x i32>
27472 /// %4 = zext/sext <N x i16> %3 to <N x i32>
27473 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
27474 /// %5 = mul <N x i32> %2, %4
27475 ///
27476 /// There are four mul shrinking modes:
27477 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
27478 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
27479 /// generate pmullw+sext32 for it (MULS8 mode).
27480 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
27481 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
27482 /// generate pmullw+zext32 for it (MULU8 mode).
27483 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
27484 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
27485 /// generate pmullw+pmulhw for it (MULS16 mode).
27486 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
27487 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
27488 /// generate pmullw+pmulhuw for it (MULU16 mode).
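///
/// As an illustrative sketch (assuming the MULU16 case on a pre-SSE4.1
/// target), a v4i32 multiply whose operands fit in unsigned 16 bits becomes
/// roughly:
///   pmullw                 ; low 16 bits of each 16-bit product
///   pmulhuw                ; high 16 bits of each product
///   punpcklwd / punpckhwd  ; interleave low/high halves into i32 lanes
/// instead of a scalarized or widened 32-bit multiply.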
27489 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
27490 const X86Subtarget &Subtarget) {
27491 // pmulld is supported since SSE4.1. It is better to use pmulld
27492 // instead of pmullw+pmulhw.
27493 if (Subtarget.hasSSE41())
27494 return SDValue();
27495
27496 ShrinkMode Mode;
27497 if (!canReduceVMulWidth(N, DAG, Mode))
27498 return SDValue();
27499
27500 SDLoc DL(N);
27501 SDValue N0 = N->getOperand(0);
27502 SDValue N1 = N->getOperand(1);
27503 EVT VT = N->getOperand(0).getValueType();
27504 unsigned RegSize = 128;
27505 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
27506 EVT ReducedVT =
27507 EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
27508 // Shrink the operands of mul.
27509 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
27510 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
27511
27512 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
27513 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
27514 // lower part is needed.
27515 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
27516 if (Mode == MULU8 || Mode == MULS8) {
27517 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
27518 DL, VT, MulLo);
27519 } else {
27520 MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
27521 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
27522 // the higher part is also needed.
27523 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
27524 ReducedVT, NewN0, NewN1);
27525
27526 // Repack the lower part and higher part result of mul into a wider
27527 // result.
27528 // Generate shuffle functioning as punpcklwd.
27529 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
27530 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
27531 ShuffleMask[2 * i] = i;
27532 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
27533 }
27534 SDValue ResLo =
27535 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
27536 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
27537 // Generate shuffle functioning as punpckhwd.
27538 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
27539 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
27540 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
27541 }
27542 SDValue ResHi =
27543 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
27544 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
27545 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
27546 }
27547 } else {
27548 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
27549 // to legalize the mul explicitly because implicit legalization for type
27550 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
27551 // instructions which will not exist when we explicitly legalize it by
27552 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
27553 // <4 x i16> undef).
27554 //
27555 // Legalize the operands of mul.
27556 SmallVector<SDValue, 16> Ops(RegSize / ReducedVT.getSizeInBits(),
27557 DAG.getUNDEF(ReducedVT));
27558 Ops[0] = NewN0;
27559 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
27560 Ops[0] = NewN1;
27561 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
27562
27563 if (Mode == MULU8 || Mode == MULS8) {
27564 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
27565 // part is needed.
27566 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
27567
27568 // convert the type of mul result to VT.
27569 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
27570 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
27571 : ISD::SIGN_EXTEND_VECTOR_INREG,
27572 DL, ResVT, Mul);
27573 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
27574 DAG.getIntPtrConstant(0, DL));
27575 } else {
27576 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
27577 // MULU16/MULS16, both parts are needed.
27578 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
27579 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
27580 OpsVT, NewN0, NewN1);
27581
27582 // Repack the lower part and higher part result of mul into a wider
27583 // result. Make sure the type of mul result is VT.
27584 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
27585 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
27586 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
27587 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
27588 DAG.getIntPtrConstant(0, DL));
27589 }
27590 }
27591 }
27592
27593 /// Optimize a single multiply with constant into two operations in order to
27594 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
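///
/// For example (illustrative only), x * 45 can be decomposed as 9 * 5 and
/// emitted as two LEAs (lea (x, x, 8) then lea (t, t, 4)), and x * 40 as a
/// multiply by 5 combined with a shift left by 3; the exact ordering below
/// depends on whether the power-of-2 factor can fold into an addressing mode.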
27595 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
27596 TargetLowering::DAGCombinerInfo &DCI,
27597 const X86Subtarget &Subtarget) {
27598 EVT VT = N->getValueType(0);
27599 if (DCI.isBeforeLegalize() && VT.isVector())
27600 return reduceVMULWidth(N, DAG, Subtarget);
27601
27602 // An imul is usually smaller than the alternative sequence.
27603 if (DAG.getMachineFunction().getFunction()->optForMinSize())
27604 return SDValue();
27605
27606 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
27607 return SDValue();
27608
27609 if (VT != MVT::i64 && VT != MVT::i32)
27610 return SDValue();
27611
27612 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
27613 if (!C)
27614 return SDValue();
27615 uint64_t MulAmt = C->getZExtValue();
27616 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
27617 return SDValue();
27618
27619 uint64_t MulAmt1 = 0;
27620 uint64_t MulAmt2 = 0;
27621 if ((MulAmt % 9) == 0) {
27622 MulAmt1 = 9;
27623 MulAmt2 = MulAmt / 9;
27624 } else if ((MulAmt % 5) == 0) {
27625 MulAmt1 = 5;
27626 MulAmt2 = MulAmt / 5;
27627 } else if ((MulAmt % 3) == 0) {
27628 MulAmt1 = 3;
27629 MulAmt2 = MulAmt / 3;
27630 }
27631
27632 SDLoc DL(N);
27633 SDValue NewMul;
27634 if (MulAmt2 &&
27635 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
27636
27637 if (isPowerOf2_64(MulAmt2) &&
27638 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
27639 // If the second multiplier is a power of 2, issue it first. We want the
27640 // multiply by 3, 5, or 9 to be folded into the addressing mode unless the
27641 // lone use is an add.
27642 std::swap(MulAmt1, MulAmt2);
27643
27644 if (isPowerOf2_64(MulAmt1))
27645 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
27646 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
27647 else
27648 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
27649 DAG.getConstant(MulAmt1, DL, VT));
27650
27651 if (isPowerOf2_64(MulAmt2))
27652 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
27653 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
27654 else
27655 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
27656 DAG.getConstant(MulAmt2, DL, VT));
27657 }
27658
27659 if (!NewMul) {
27660 assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
27661 && "Both cases that could cause potential overflows should have "
27662 "already been handled.");
27663 if (isPowerOf2_64(MulAmt - 1))
27664 // (mul x, 2^N + 1) => (add (shl x, N), x)
27665 NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
27666 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
27667 DAG.getConstant(Log2_64(MulAmt - 1), DL,
27668 MVT::i8)));
27669
27670 else if (isPowerOf2_64(MulAmt + 1))
27671 // (mul x, 2^N - 1) => (sub (shl x, N), x)
27672 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
27673 N->getOperand(0),
27674 DAG.getConstant(Log2_64(MulAmt + 1),
27675 DL, MVT::i8)), N->getOperand(0));
27676 }
27677
27678 if (NewMul)
27679 // Do not add new nodes to DAG combiner worklist.
27680 DCI.CombineTo(N, NewMul, false);
27681
27682 return SDValue();
27683 }
27684
27685 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
27686 SDValue N0 = N->getOperand(0);
27687 SDValue N1 = N->getOperand(1);
27688 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
27689 EVT VT = N0.getValueType();
27690
27691 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
27692 // since the result of setcc_c is all zero's or all ones.
27693 if (VT.isInteger() && !VT.isVector() &&
27694 N1C && N0.getOpcode() == ISD::AND &&
27695 N0.getOperand(1).getOpcode() == ISD::Constant) {
27696 SDValue N00 = N0.getOperand(0);
27697 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
27698 const APInt &ShAmt = N1C->getAPIntValue();
27699 Mask = Mask.shl(ShAmt);
27700 bool MaskOK = false;
27701 // We can handle cases concerning bit-widening nodes containing setcc_c if
27702 // we carefully interrogate the mask to make sure the transform is
27703 // semantics preserving.
27704 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
27705 // of the underlying setcc_c operation if the setcc_c was zero extended.
27706 // Consider the following example:
27707 // zext(setcc_c) -> i32 0x0000FFFF
27708 // c1 -> i32 0x0000FFFF
27709 // c2 -> i32 0x00000001
27710 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
27711 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
27712 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
27713 MaskOK = true;
27714 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
27715 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
27716 MaskOK = true;
27717 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
27718 N00.getOpcode() == ISD::ANY_EXTEND) &&
27719 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
27720 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
27721 }
27722 if (MaskOK && Mask != 0) {
27723 SDLoc DL(N);
27724 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
27725 }
27726 }
27727
27728 // Hardware support for vector shifts is sparse which makes us scalarize the
27729 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
27730 // shl.
27731 // (shl V, 1) -> add V,V
27732 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
27733 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
27734 assert(N0.getValueType().isVector() && "Invalid vector shift type");
27735 // We shift all of the values by one. In many cases we do not have
27736 // hardware support for this operation. This is better expressed as an ADD
27737 // of two values.
27738 if (N1SplatC->getAPIntValue() == 1)
27739 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
27740 }
27741
27742 return SDValue();
27743 }
27744
27745 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
27746 SDValue N0 = N->getOperand(0);
27747 SDValue N1 = N->getOperand(1);
27748 EVT VT = N0.getValueType();
27749 unsigned Size = VT.getSizeInBits();
27750
27751 // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
27752 // into (shl (sext a), [56,48,32,24,16] - SarConst) or
27753 // into (ashr (sext a), SarConst - [56,48,32,24,16])
27754 // depending on the sign of (SarConst - [56,48,32,24,16])
27755
27756 // sexts on X86 are MOVs (movsx). These MOVs have the same code size as the
27757 // SHIFTs above (only a shift by 1 has smaller code size).
27758 // However, the MOVs have two advantages over a SHIFT:
27759 // 1. a MOV can write to a register that differs from its source
27760 // 2. a MOV accepts memory operands
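//
// Illustrative example (i64, values chosen for exposition): the pattern
// (sra (shl X, 56), 58) keeps only the low byte of X; it is rewritten below
// as (sra (sign_extend_inreg X, i8), 2), i.e. a byte sign-extension (a MOV)
// followed by a smaller arithmetic shift.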
27761
27762 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
27763 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
27764 N0.getOperand(1).getOpcode() != ISD::Constant)
27765 return SDValue();
27766
27767 SDValue N00 = N0.getOperand(0);
27768 SDValue N01 = N0.getOperand(1);
27769 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
27770 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
27771 EVT CVT = N1.getValueType();
27772
27773 if (SarConst.isNegative())
27774 return SDValue();
27775
27776 for (MVT SVT : MVT::integer_valuetypes()) {
27777 unsigned ShiftSize = SVT.getSizeInBits();
27778 // Skip types without a corresponding sext/zext and ShlConst values that are
27779 // not one of [56,48,32,24,16].
27780 if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
27781 continue;
27782 SDLoc DL(N);
27783 SDValue NN =
27784 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
27785 SarConst = SarConst - (Size - ShiftSize);
27786 if (SarConst == 0)
27787 return NN;
27788 else if (SarConst.isNegative())
27789 return DAG.getNode(ISD::SHL, DL, VT, NN,
27790 DAG.getConstant(-SarConst, DL, CVT));
27791 else
27792 return DAG.getNode(ISD::SRA, DL, VT, NN,
27793 DAG.getConstant(SarConst, DL, CVT));
27794 }
27795 return SDValue();
27796 }
27797
27798 /// \brief Returns a vector of 0s if the node in input is a vector logical
27799 /// shift by a constant amount which is known to be bigger than or equal
27800 /// to the vector element size in bits.
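///
/// For example (illustrative), (srl v4i32:X, splat 32) shifts each 32-bit
/// element by its full width, so it is folded to the v4i32 zero vector here
/// rather than emitted as a shift with an out-of-range immediate.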
27801 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
27802 const X86Subtarget &Subtarget) {
27803 EVT VT = N->getValueType(0);
27804
27805 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
27806 (!Subtarget.hasInt256() ||
27807 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
27808 return SDValue();
27809
27810 SDValue Amt = N->getOperand(1);
27811 SDLoc DL(N);
27812 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
27813 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
27814 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
27815 unsigned MaxAmount =
27816 VT.getSimpleVT().getVectorElementType().getSizeInBits();
27817
27818 // SSE2/AVX2 logical shifts always return a vector of 0s
27819 // if the shift amount is bigger than or equal to
27820 // the element size. The constant shift amount will be
27821 // encoded as an 8-bit immediate.
27822 if (ShiftAmt.trunc(8).uge(MaxAmount))
27823 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
27824 }
27825
27826 return SDValue();
27827 }
27828
27829 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
27830 TargetLowering::DAGCombinerInfo &DCI,
27831 const X86Subtarget &Subtarget) {
27832 if (N->getOpcode() == ISD::SHL)
27833 if (SDValue V = combineShiftLeft(N, DAG))
27834 return V;
27835
27836 if (N->getOpcode() == ISD::SRA)
27837 if (SDValue V = combineShiftRightAlgebraic(N, DAG))
27838 return V;
27839
27840 // Try to fold this logical shift into a zero vector.
27841 if (N->getOpcode() != ISD::SRA)
27842 if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
27843 return V;
27844
27845 return SDValue();
27846 }
27847
27848 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
27849 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
27850 /// OR -> CMPNEQSS.
27851 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
27852 TargetLowering::DAGCombinerInfo &DCI,
27853 const X86Subtarget &Subtarget) {
27854 unsigned opcode;
27855
27856 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
27857 // we're requiring SSE2 for both.
27858 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
27859 SDValue N0 = N->getOperand(0);
27860 SDValue N1 = N->getOperand(1);
27861 SDValue CMP0 = N0->getOperand(1);
27862 SDValue CMP1 = N1->getOperand(1);
27863 SDLoc DL(N);
27864
27865 // The SETCCs should both refer to the same CMP.
27866 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
27867 return SDValue();
27868
27869 SDValue CMP00 = CMP0->getOperand(0);
27870 SDValue CMP01 = CMP0->getOperand(1);
27871 EVT VT = CMP00.getValueType();
27872
27873 if (VT == MVT::f32 || VT == MVT::f64) {
27874 bool ExpectingFlags = false;
27875 // Check for any users that want flags:
27876 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
27877 !ExpectingFlags && UI != UE; ++UI)
27878 switch (UI->getOpcode()) {
27879 default:
27880 case ISD::BR_CC:
27881 case ISD::BRCOND:
27882 case ISD::SELECT:
27883 ExpectingFlags = true;
27884 break;
27885 case ISD::CopyToReg:
27886 case ISD::SIGN_EXTEND:
27887 case ISD::ZERO_EXTEND:
27888 case ISD::ANY_EXTEND:
27889 break;
27890 }
27891
27892 if (!ExpectingFlags) {
27893 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
27894 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
27895
27896 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
27897 X86::CondCode tmp = cc0;
27898 cc0 = cc1;
27899 cc1 = tmp;
27900 }
27901
27902 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
27903 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
27904 // FIXME: need symbolic constants for these magic numbers.
27905 // See X86ATTInstPrinter.cpp:printSSECC().
27906 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
27907 if (Subtarget.hasAVX512()) {
27908 SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
27909 CMP01,
27910 DAG.getConstant(x86cc, DL, MVT::i8));
27911 if (N->getValueType(0) != MVT::i1)
27912 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
27913 FSetCC);
27914 return FSetCC;
27915 }
27916 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
27917 CMP00.getValueType(), CMP00, CMP01,
27918 DAG.getConstant(x86cc, DL,
27919 MVT::i8));
27920
27921 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
27922 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
27923
27924 if (is64BitFP && !Subtarget.is64Bit()) {
27925 // On a 32-bit target, we cannot bitcast the 64-bit float to a
27926 // 64-bit integer, since that's not a legal type. Since
27927 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
27928 // bits, but can do this little dance to extract the lowest 32 bits
27929 // and work with those going forward.
27930 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
27931 OnesOrZeroesF);
27932 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
27933 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
27934 Vector32, DAG.getIntPtrConstant(0, DL));
27935 IntVT = MVT::i32;
27936 }
27937
27938 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
27939 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
27940 DAG.getConstant(1, DL, IntVT));
27941 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27942 ANDed);
27943 return OneBitOfTruth;
27944 }
27945 }
27946 }
27947 }
27948 return SDValue();
27949 }
27950
27951 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
27952 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
27953 assert(N->getOpcode() == ISD::AND);
27954
27955 EVT VT = N->getValueType(0);
27956 SDValue N0 = N->getOperand(0);
27957 SDValue N1 = N->getOperand(1);
27958 SDLoc DL(N);
27959
27960 if (VT != MVT::v2i64 && VT != MVT::v4i64 &&
27961 VT != MVT::v8i64 && VT != MVT::v16i32 &&
27962 VT != MVT::v4i32 && VT != MVT::v8i32) // Legal with VLX
27963 return SDValue();
27964
27965 // Canonicalize XOR to the left.
27966 if (N1.getOpcode() == ISD::XOR)
27967 std::swap(N0, N1);
27968
27969 if (N0.getOpcode() != ISD::XOR)
27970 return SDValue();
27971
27972 SDValue N00 = N0->getOperand(0);
27973 SDValue N01 = N0->getOperand(1);
27974
27975 N01 = peekThroughBitcasts(N01);
27976
27977 // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
27978 // insert_subvector building a 256-bit AllOnes vector.
27979 if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
27980 if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
27981 return SDValue();
27982
27983 SDValue V1 = N01->getOperand(0);
27984 SDValue V2 = N01->getOperand(1);
27985 if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
27986 !V1.getOperand(0).isUndef() ||
27987 !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
27988 !ISD::isBuildVectorAllOnes(V2.getNode()))
27989 return SDValue();
27990 }
27991 return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
27992 }
27993
27994 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
27995 // register. In most cases we actually compare or select YMM-sized registers
27996 // and mixing the two types creates horrible code. This method optimizes
27997 // some of the transition sequences.
27998 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
27999 TargetLowering::DAGCombinerInfo &DCI,
28000 const X86Subtarget &Subtarget) {
28001 EVT VT = N->getValueType(0);
28002 if (!VT.is256BitVector())
28003 return SDValue();
28004
28005 assert((N->getOpcode() == ISD::ANY_EXTEND ||
28006 N->getOpcode() == ISD::ZERO_EXTEND ||
28007 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
28008
28009 SDValue Narrow = N->getOperand(0);
28010 EVT NarrowVT = Narrow->getValueType(0);
28011 if (!NarrowVT.is128BitVector())
28012 return SDValue();
28013
28014 if (Narrow->getOpcode() != ISD::XOR &&
28015 Narrow->getOpcode() != ISD::AND &&
28016 Narrow->getOpcode() != ISD::OR)
28017 return SDValue();
28018
28019 SDValue N0 = Narrow->getOperand(0);
28020 SDValue N1 = Narrow->getOperand(1);
28021 SDLoc DL(Narrow);
28022
28023 // The Left side has to be a trunc.
28024 if (N0.getOpcode() != ISD::TRUNCATE)
28025 return SDValue();
28026
28027 // The type of the truncated inputs.
28028 EVT WideVT = N0->getOperand(0)->getValueType(0);
28029 if (WideVT != VT)
28030 return SDValue();
28031
28032 // The right side has to be a 'trunc' or a constant vector.
28033 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
28034 ConstantSDNode *RHSConstSplat = nullptr;
28035 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
28036 RHSConstSplat = RHSBV->getConstantSplatNode();
28037 if (!RHSTrunc && !RHSConstSplat)
28038 return SDValue();
28039
28040 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28041
28042 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
28043 return SDValue();
28044
28045 // Set N0 and N1 to hold the inputs to the new wide operation.
28046 N0 = N0->getOperand(0);
28047 if (RHSConstSplat) {
28048 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
28049 SDValue(RHSConstSplat, 0));
28050 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
28051 } else if (RHSTrunc) {
28052 N1 = N1->getOperand(0);
28053 }
28054
28055 // Generate the wide operation.
28056 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
28057 unsigned Opcode = N->getOpcode();
28058 switch (Opcode) {
28059 case ISD::ANY_EXTEND:
28060 return Op;
28061 case ISD::ZERO_EXTEND: {
28062 unsigned InBits = NarrowVT.getScalarSizeInBits();
28063 APInt Mask = APInt::getAllOnesValue(InBits);
28064 Mask = Mask.zext(VT.getScalarSizeInBits());
28065 return DAG.getNode(ISD::AND, DL, VT,
28066 Op, DAG.getConstant(Mask, DL, VT));
28067 }
28068 case ISD::SIGN_EXTEND:
28069 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
28070 Op, DAG.getValueType(NarrowVT));
28071 default:
28072 llvm_unreachable("Unexpected opcode");
28073 }
28074 }
28075
28076 static SDValue combineVectorZext(SDNode *N, SelectionDAG &DAG,
28077 TargetLowering::DAGCombinerInfo &DCI,
28078 const X86Subtarget &Subtarget) {
28079 SDValue N0 = N->getOperand(0);
28080 SDValue N1 = N->getOperand(1);
28081 SDLoc DL(N);
28082
28083 // A vector zext_in_reg may be represented as a shuffle,
28084 // feeding into a bitcast (this represents anyext) feeding into
28085 // an and with a mask.
28086 // We'd like to try to combine that into a shuffle with zero
28087 // plus a bitcast, removing the and.
28088 if (N0.getOpcode() != ISD::BITCAST ||
28089 N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
28090 return SDValue();
28091
28092 // The other side of the AND should be a splat of 2^C - 1, where C
28093 // is the number of bits in the source type.
28094 N1 = peekThroughBitcasts(N1);
28095 if (N1.getOpcode() != ISD::BUILD_VECTOR)
28096 return SDValue();
28097 BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
28098
28099 ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
28100 EVT SrcType = Shuffle->getValueType(0);
28101
28102 // We expect a single-source shuffle
28103 if (!Shuffle->getOperand(1)->isUndef())
28104 return SDValue();
28105
28106 unsigned SrcSize = SrcType.getScalarSizeInBits();
28107 unsigned NumElems = SrcType.getVectorNumElements();
28108
28109 APInt SplatValue, SplatUndef;
28110 unsigned SplatBitSize;
28111 bool HasAnyUndefs;
28112 if (!Vector->isConstantSplat(SplatValue, SplatUndef,
28113 SplatBitSize, HasAnyUndefs))
28114 return SDValue();
28115
28116 unsigned ResSize = N1.getValueType().getScalarSizeInBits();
28117 // Make sure the splat matches the mask we expect
28118 if (SplatBitSize > ResSize ||
28119 (SplatValue + 1).exactLogBase2() != (int)SrcSize)
28120 return SDValue();
28121
28122 // Make sure the input and output size make sense
28123 if (SrcSize >= ResSize || ResSize % SrcSize)
28124 return SDValue();
28125
28126 // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
28127 // The number of u's between each two values depends on the ratio between
28128 // the source and dest type.
28129 unsigned ZextRatio = ResSize / SrcSize;
28130 bool IsZext = true;
28131 for (unsigned i = 0; i != NumElems; ++i) {
28132 if (i % ZextRatio) {
28133 if (Shuffle->getMaskElt(i) > 0) {
28134 // Expected undef
28135 IsZext = false;
28136 break;
28137 }
28138 } else {
28139 if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
28140 // Expected element number
28141 IsZext = false;
28142 break;
28143 }
28144 }
28145 }
28146
28147 if (!IsZext)
28148 return SDValue();
28149
28150 // Ok, perform the transformation - replace the shuffle with
28151 // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
28152 // (instead of undef) where the k elements come from the zero vector.
28153 SmallVector<int, 8> Mask;
28154 for (unsigned i = 0; i != NumElems; ++i)
28155 if (i % ZextRatio)
28156 Mask.push_back(NumElems);
28157 else
28158 Mask.push_back(i / ZextRatio);
28159
28160 SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
28161 Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask);
28162 return DAG.getBitcast(N0.getValueType(), NewShuffle);
28163 }
28164
28165 /// If both input operands of a logic op are being cast from floating point
28166 /// types, try to convert this into a floating point logic node to avoid
28167 /// unnecessary moves from SSE to integer registers.
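///
/// For example (sketch of the i32/SSE1 case), (and (bitcast f32 X to i32),
/// (bitcast f32 Y to i32)) becomes (bitcast (X86ISD::FAND X, Y) to i32),
/// keeping both values in SSE registers instead of round-tripping through
/// GPRs.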
28168 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
28169 const X86Subtarget &Subtarget) {
28170 unsigned FPOpcode = ISD::DELETED_NODE;
28171 if (N->getOpcode() == ISD::AND)
28172 FPOpcode = X86ISD::FAND;
28173 else if (N->getOpcode() == ISD::OR)
28174 FPOpcode = X86ISD::FOR;
28175 else if (N->getOpcode() == ISD::XOR)
28176 FPOpcode = X86ISD::FXOR;
28177
28178 assert(FPOpcode != ISD::DELETED_NODE &&
28179 "Unexpected input node for FP logic conversion");
28180
28181 EVT VT = N->getValueType(0);
28182 SDValue N0 = N->getOperand(0);
28183 SDValue N1 = N->getOperand(1);
28184 SDLoc DL(N);
28185 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
28186 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
28187 (Subtarget.hasSSE2() && VT == MVT::i64))) {
28188 SDValue N00 = N0.getOperand(0);
28189 SDValue N10 = N1.getOperand(0);
28190 EVT N00Type = N00.getValueType();
28191 EVT N10Type = N10.getValueType();
28192 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
28193 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
28194 return DAG.getBitcast(VT, FPLogic);
28195 }
28196 }
28197 return SDValue();
28198 }
28199
28200 /// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
28201 /// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
28202 /// eliminate loading the vector constant mask value. This relies on the fact
28203 /// that a PCMP always creates an all-ones or all-zeros bitmask per element.
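///
/// For example (illustrative), (and (pcmpgt v4i32:X, Y), splat 1) can become
/// (srl (pcmpgt X, Y), 31): each lane is already all-ones or all-zeros, so a
/// logical right shift by 31 produces the same 0/1 result without loading a
/// constant vector of 1s.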
28204 static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
28205 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
28206 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
28207
28208 // TODO: Use AssertSext to mark any nodes that have the property of producing
28209 // all-ones or all-zeros. Then check for that node rather than particular
28210 // opcodes.
28211 if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
28212 return SDValue();
28213
28214 // The existence of the PCMP node guarantees that we have the required SSE2 or
28215 // AVX2 for a shift of this vector type, but there is no vector shift by
28216 // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
28217 // masked compare nodes, so they should not make it here.
28218 EVT VT0 = Op0.getValueType();
28219 EVT VT1 = Op1.getValueType();
28220 unsigned EltBitWidth = VT0.getScalarType().getSizeInBits();
28221 if (VT0 != VT1 || EltBitWidth == 8)
28222 return SDValue();
28223
28224 assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
28225
28226 APInt SplatVal;
28227 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
28228 return SDValue();
28229
28230 SDLoc DL(N);
28231 SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
28232 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
28233 return DAG.getBitcast(N->getValueType(0), Shift);
28234 }
28235
28236 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
28237 TargetLowering::DAGCombinerInfo &DCI,
28238 const X86Subtarget &Subtarget) {
28239 if (DCI.isBeforeLegalizeOps())
28240 return SDValue();
28241
28242 if (SDValue Zext = combineVectorZext(N, DAG, DCI, Subtarget))
28243 return Zext;
28244
28245 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
28246 return R;
28247
28248 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
28249 return FPLogic;
28250
28251 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
28252 return R;
28253
28254 if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
28255 return ShiftRight;
28256
28257 EVT VT = N->getValueType(0);
28258 SDValue N0 = N->getOperand(0);
28259 SDValue N1 = N->getOperand(1);
28260 SDLoc DL(N);
28261
28262 // Create BEXTR instructions
28263 // BEXTR is ((X >> imm) & (2**size-1))
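// For example (illustrative of the encoding below), ((X >> 8) & 0xFFFF)
// extracts a 16-bit field starting at bit 8, giving a BEXTR control value of
// Shift | (MaskSize << 8) = 8 | (16 << 8) = 0x1008.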
28264 if (VT != MVT::i32 && VT != MVT::i64)
28265 return SDValue();
28266
28267 if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
28268 return SDValue();
28269 if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
28270 return SDValue();
28271
28272 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
28273 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
28274 if (MaskNode && ShiftNode) {
28275 uint64_t Mask = MaskNode->getZExtValue();
28276 uint64_t Shift = ShiftNode->getZExtValue();
28277 if (isMask_64(Mask)) {
28278 uint64_t MaskSize = countPopulation(Mask);
28279 if (Shift + MaskSize <= VT.getSizeInBits())
28280 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
28281 DAG.getConstant(Shift | (MaskSize << 8), DL,
28282 VT));
28283 }
28284 }
28285 return SDValue();
28286 }
28287
28288 // Try to fold:
28289 // (or (and (m, y), (pandn m, x)))
28290 // into:
28291 // (vselect m, x, y)
28292 // As a special case, try to fold:
28293 // (or (and (m, (sub 0, x)), (pandn m, x)))
28294 // into:
28295 // (sub (xor X, M), M)
28296 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
28297 const X86Subtarget &Subtarget) {
28298 assert(N->getOpcode() == ISD::OR);
28299
28300 SDValue N0 = N->getOperand(0);
28301 SDValue N1 = N->getOperand(1);
28302 EVT VT = N->getValueType(0);
28303
28304 if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
28305 return SDValue();
28306 assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
28307
28308 // Canonicalize pandn to RHS
28309 if (N0.getOpcode() == X86ISD::ANDNP)
28310 std::swap(N0, N1);
28311
28312 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
28313 return SDValue();
28314
28315 SDValue Mask = N1.getOperand(0);
28316 SDValue X = N1.getOperand(1);
28317 SDValue Y;
28318 if (N0.getOperand(0) == Mask)
28319 Y = N0.getOperand(1);
28320 if (N0.getOperand(1) == Mask)
28321 Y = N0.getOperand(0);
28322
28323 // Check to see if the mask appeared in both the AND and ANDNP.
28324 if (!Y.getNode())
28325 return SDValue();
28326
28327 // Validate that X, Y, and Mask are bitcasts, and see through them.
28328 Mask = peekThroughBitcasts(Mask);
28329 X = peekThroughBitcasts(X);
28330 Y = peekThroughBitcasts(Y);
28331
28332 EVT MaskVT = Mask.getValueType();
28333
28334 // Validate that the Mask operand is a vector sra node.
28335 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
28336 // there is no psrai.b
28337 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
28338 unsigned SraAmt = ~0;
28339 if (Mask.getOpcode() == ISD::SRA) {
28340 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
28341 if (auto *AmtConst = AmtBV->getConstantSplatNode())
28342 SraAmt = AmtConst->getZExtValue();
28343 } else if (Mask.getOpcode() == X86ISD::VSRAI) {
28344 SDValue SraC = Mask.getOperand(1);
28345 SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
28346 }
28347 if ((SraAmt + 1) != EltBits)
28348 return SDValue();
28349
28350 SDLoc DL(N);
28351
28352 // Try to match:
28353 // (or (and (M, (sub 0, X)), (pandn M, X)))
28354 // which is a special case of vselect:
28355 // (vselect M, (sub 0, X), X)
28356 // Per:
28357 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
28358 // We know that, if fNegate is 0 or 1:
28359 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
28360 //
28361 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
28362 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
28363 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
28364 // This lets us transform our vselect to:
28365 // (add (xor X, M), (and M, 1))
28366 // And further to:
28367 // (sub (xor X, M), M)
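// A quick scalar sanity check of that identity: for X = 5 and M = all-ones
// (-1), (xor X, M) - M = (~5) - (-1) = -6 + 1 = -5; for M = 0 it is
// (5 ^ 0) - 0 = 5, i.e. X is left unchanged.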
28368 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
28369 auto IsNegV = [](SDNode *N, SDValue V) {
28370 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
28371 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
28372 };
28373 SDValue V;
28374 if (IsNegV(Y.getNode(), X))
28375 V = X;
28376 else if (IsNegV(X.getNode(), Y))
28377 V = Y;
28378
28379 if (V) {
28380 assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
28381 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
28382 SDValue SubOp2 = Mask;
28383
28384 // If the negate was on the false side of the select, then
28385 // the operands of the SUB need to be swapped. PR 27251.
28386 // This is because the pattern being matched above is
28387 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
28388 // but if the pattern matched was
28389 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
28390 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
28391 // pattern also needs to be a negation of the replacement pattern above.
28392 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
28393 // sub accomplishes the negation of the replacement pattern.
28394 if (V == Y)
28395 std::swap(SubOp1, SubOp2);
28396
28397 return DAG.getBitcast(VT,
28398 DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
28399 }
28400 }
28401
28402 // PBLENDVB is only available on SSE 4.1.
28403 if (!Subtarget.hasSSE41())
28404 return SDValue();
28405
28406 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
28407
28408 X = DAG.getBitcast(BlendVT, X);
28409 Y = DAG.getBitcast(BlendVT, Y);
28410 Mask = DAG.getBitcast(BlendVT, Mask);
28411 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
28412 return DAG.getBitcast(VT, Mask);
28413 }
28414
28415 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
28416 TargetLowering::DAGCombinerInfo &DCI,
28417 const X86Subtarget &Subtarget) {
28418 if (DCI.isBeforeLegalizeOps())
28419 return SDValue();
28420
28421 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
28422 return R;
28423
28424 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
28425 return FPLogic;
28426
28427 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
28428 return R;
28429
28430 SDValue N0 = N->getOperand(0);
28431 SDValue N1 = N->getOperand(1);
28432 EVT VT = N->getValueType(0);
28433
28434 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
28435 return SDValue();
28436
28437 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
28438 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
28439
28440 // SHLD/SHRD instructions have lower register pressure, but on some
28441 // platforms they have higher latency than the equivalent
28442 // series of shifts/or that would otherwise be generated.
28443 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
28444 // have higher latencies and we are not optimizing for size.
28445 if (!OptForSize && Subtarget.isSHLDSlow())
28446 return SDValue();
28447
28448 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
28449 std::swap(N0, N1);
28450 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
28451 return SDValue();
28452 if (!N0.hasOneUse() || !N1.hasOneUse())
28453 return SDValue();
28454
28455 SDValue ShAmt0 = N0.getOperand(1);
28456 if (ShAmt0.getValueType() != MVT::i8)
28457 return SDValue();
28458 SDValue ShAmt1 = N1.getOperand(1);
28459 if (ShAmt1.getValueType() != MVT::i8)
28460 return SDValue();
28461 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
28462 ShAmt0 = ShAmt0.getOperand(0);
28463 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
28464 ShAmt1 = ShAmt1.getOperand(0);
28465
28466 SDLoc DL(N);
28467 unsigned Opc = X86ISD::SHLD;
28468 SDValue Op0 = N0.getOperand(0);
28469 SDValue Op1 = N1.getOperand(0);
28470 if (ShAmt0.getOpcode() == ISD::SUB) {
28471 Opc = X86ISD::SHRD;
28472 std::swap(Op0, Op1);
28473 std::swap(ShAmt0, ShAmt1);
28474 }
28475
28476 unsigned Bits = VT.getSizeInBits();
28477 if (ShAmt1.getOpcode() == ISD::SUB) {
28478 SDValue Sum = ShAmt1.getOperand(0);
28479 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
28480 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
28481 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
28482 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
28483 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
28484 return DAG.getNode(Opc, DL, VT,
28485 Op0, Op1,
28486 DAG.getNode(ISD::TRUNCATE, DL,
28487 MVT::i8, ShAmt0));
28488 }
28489 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
28490 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
28491 if (ShAmt0C &&
28492 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
28493 return DAG.getNode(Opc, DL, VT,
28494 N0.getOperand(0), N1.getOperand(0),
28495 DAG.getNode(ISD::TRUNCATE, DL,
28496 MVT::i8, ShAmt0));
28497 }
28498
28499 return SDValue();
28500 }
28501
28502 // Generate NEG and CMOV for integer abs.
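// For example (illustrative), abs(X) written in IR for i32 as
//   xor (add X, (sra X, 31)), (sra X, 31)
// is rewritten here into a subtract-from-zero (which sets EFLAGS) plus a
// CMOV selecting between X and -X, avoiding the shift/add/xor sequence.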
28503 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
28504 EVT VT = N->getValueType(0);
28505
28506 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28507 // 8-bit integer abs to NEG and CMOV.
28508 if (VT.isInteger() && VT.getSizeInBits() == 8)
28509 return SDValue();
28510
28511 SDValue N0 = N->getOperand(0);
28512 SDValue N1 = N->getOperand(1);
28513 SDLoc DL(N);
28514
28515 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
28516 // and change it to SUB and CMOV.
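  // This is the DAG produced by the classic branchless abs idiom (a sketch,
  // assuming a 32-bit int):
  //
  //   int iabs(int x) {
  //     int y = x >> 31;       // Y = SRA(X, size(X)-1)
  //     return (x + y) ^ y;    // XOR(ADD(X, Y), Y)
  //   }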
28517 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
28518 N0.getOpcode() == ISD::ADD &&
28519 N0.getOperand(1) == N1 &&
28520 N1.getOpcode() == ISD::SRA &&
28521 N1.getOperand(0) == N0.getOperand(0))
28522 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
28523 if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
28524 // Generate SUB & CMOV.
28525 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28526 DAG.getConstant(0, DL, VT), N0.getOperand(0));
28527
28528 SDValue Ops[] = { N0.getOperand(0), Neg,
28529 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
28530 SDValue(Neg.getNode(), 1) };
28531 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
28532 }
28533 return SDValue();
28534 }
28535
28536 /// Try to turn tests against the signbit in the form of:
28537 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
28538 /// into:
28539 /// SETGT(X, -1)
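/// A source-level sign-bit test that produces this pattern (a sketch, assuming
/// a 32-bit input and an i8/i1 result):
///
///   bool is_nonnegative(int x) {
///     return ((unsigned)x >> 31) ^ 1;   // becomes setgt x, -1
///   }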
28540 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
28541 // This is only worth doing if the output type is i8 or i1.
28542 EVT ResultType = N->getValueType(0);
28543 if (ResultType != MVT::i8 && ResultType != MVT::i1)
28544 return SDValue();
28545
28546 SDValue N0 = N->getOperand(0);
28547 SDValue N1 = N->getOperand(1);
28548
28549 // We should be performing an xor against a truncated shift.
28550 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
28551 return SDValue();
28552
28553 // Make sure we are performing an xor against one.
28554 if (!isOneConstant(N1))
28555 return SDValue();
28556
28557 // SetCC on x86 zero-extends, so only act on this if it's a logical shift.
28558 SDValue Shift = N0.getOperand(0);
28559 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
28560 return SDValue();
28561
28562 // Make sure we are truncating from one of i16, i32 or i64.
28563 EVT ShiftTy = Shift.getValueType();
28564 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
28565 return SDValue();
28566
28567 // Make sure the shift amount extracts the sign bit.
28568 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
28569 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
28570 return SDValue();
28571
28572 // Create a greater-than comparison against -1.
28573 // N.B. Using SETGE against 0 works but we want a canonical-looking
28574 // comparison; using SETGT matches up with what TranslateX86CC expects.
28575 SDLoc DL(N);
28576 SDValue ShiftOp = Shift.getOperand(0);
28577 EVT ShiftOpTy = ShiftOp.getValueType();
28578 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28579 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
28580 *DAG.getContext(), ResultType);
28581 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
28582 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
28583 if (SetCCResultType != ResultType)
28584 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
28585 return Cond;
28586 }
28587
28588 /// Turn vector tests of the signbit in the form of:
28589 /// xor (sra X, elt_size(X)-1), -1
28590 /// into:
28591 /// pcmpgt X, -1
28592 ///
28593 /// This should be called before type legalization because the pattern may not
28594 /// persist after that.
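/// For example (a sketch with 32-bit elements):
///   t1 = sra v4i32 X, <31, 31, 31, 31>
///   t2 = xor t1, <-1, -1, -1, -1>
/// becomes
///   t2 = pcmpgt X, <-1, -1, -1, -1>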
28595 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
28596 const X86Subtarget &Subtarget) {
28597 EVT VT = N->getValueType(0);
28598 if (!VT.isSimple())
28599 return SDValue();
28600
28601 switch (VT.getSimpleVT().SimpleTy) {
28602 default: return SDValue();
28603 case MVT::v16i8:
28604 case MVT::v8i16:
28605 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
28606 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
28607 case MVT::v32i8:
28608 case MVT::v16i16:
28609 case MVT::v8i32:
28610 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
28611 }
28612
28613 // There must be an arithmetic shift right before the xor, and the xor must be
28614 // a 'not' operation.
28615 SDValue Shift = N->getOperand(0);
28616 SDValue Ones = N->getOperand(1);
28617 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
28618 !ISD::isBuildVectorAllOnes(Ones.getNode()))
28619 return SDValue();
28620
28621 // The shift should be smearing the sign bit across each vector element.
28622 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
28623 if (!ShiftBV)
28624 return SDValue();
28625
28626 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
28627 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
28628 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
28629 return SDValue();
28630
28631 // Create a greater-than comparison against -1. We don't use the more obvious
28632 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
28633 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
28634 }
28635
28636 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
28637 TargetLowering::DAGCombinerInfo &DCI,
28638 const X86Subtarget &Subtarget) {
28639 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
28640 return Cmp;
28641
28642 if (DCI.isBeforeLegalizeOps())
28643 return SDValue();
28644
28645 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
28646 return RV;
28647
28648 if (Subtarget.hasCMov())
28649 if (SDValue RV = combineIntegerAbs(N, DAG))
28650 return RV;
28651
28652 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
28653 return FPLogic;
28654
28655 return SDValue();
28656 }
28657
28658 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
28659 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
28660 /// X86ISD::AVG instruction.
28661 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
28662 const X86Subtarget &Subtarget,
28663 const SDLoc &DL) {
28664 if (!VT.isVector() || !VT.isSimple())
28665 return SDValue();
28666 EVT InVT = In.getValueType();
28667 unsigned NumElems = VT.getVectorNumElements();
28668
28669 EVT ScalarVT = VT.getVectorElementType();
28670 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
28671 isPowerOf2_32(NumElems)))
28672 return SDValue();
28673
28674 // InScalarVT is the intermediate type in the AVG pattern and it should be
28675 // wider than the original input type (i8/i16).
28676 EVT InScalarVT = InVT.getVectorElementType();
28677 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
28678 return SDValue();
28679
28680 if (!Subtarget.hasSSE2())
28681 return SDValue();
28682 if (Subtarget.hasAVX512()) {
28683 if (VT.getSizeInBits() > 512)
28684 return SDValue();
28685 } else if (Subtarget.hasAVX2()) {
28686 if (VT.getSizeInBits() > 256)
28687 return SDValue();
28688 } else {
28689 if (VT.getSizeInBits() > 128)
28690 return SDValue();
28691 }
28692
28693 // Detect the following pattern:
28694 //
28695 // %1 = zext <N x i8> %a to <N x i32>
28696 // %2 = zext <N x i8> %b to <N x i32>
28697 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
28698 // %4 = add nuw nsw <N x i32> %3, %2
28699 // %5 = lshr <N x i32> %4, <i32 1 x N>
28700 // %6 = trunc <N x i32> %5 to <N x i8>
28701 //
28702 // In AVX512, the last instruction can also be a trunc store.
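  // A source loop that commonly produces this pattern (a sketch, assuming
  // unsigned 8-bit elements):
  //
  //   for (int i = 0; i < N; ++i)
  //     c[i] = (a[i] + b[i] + 1) >> 1;   // rounding average -> X86ISD::AVG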
28703
28704 if (In.getOpcode() != ISD::SRL)
28705 return SDValue();
28706
28707 // A lambda checking whether the given SDValue is a constant vector and each
28708 // element is in the range [Min, Max].
28709 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
28710 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
28711 if (!BV || !BV->isConstant())
28712 return false;
28713 for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
28714 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
28715 if (!C)
28716 return false;
28717 uint64_t Val = C->getZExtValue();
28718 if (Val < Min || Val > Max)
28719 return false;
28720 }
28721 return true;
28722 };
28723
28724 // Check if each element of the vector is logically shifted right by one.
28725 auto LHS = In.getOperand(0);
28726 auto RHS = In.getOperand(1);
28727 if (!IsConstVectorInRange(RHS, 1, 1))
28728 return SDValue();
28729 if (LHS.getOpcode() != ISD::ADD)
28730 return SDValue();
28731
28732 // Detect a pattern of a + b + 1 where the order doesn't matter.
28733 SDValue Operands[3];
28734 Operands[0] = LHS.getOperand(0);
28735 Operands[1] = LHS.getOperand(1);
28736
28737 // Take care of the case when one of the operands is a constant vector whose
28738 // elements are in the range [1, 256] (or [1, 65536] for i16).
28739 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
28740 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
28741 Operands[0].getOperand(0).getValueType() == VT) {
28742 // The pattern is detected. Subtract one from the constant vector, then
28743 // demote it and emit X86ISD::AVG instruction.
28744 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
28745 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
28746 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
28747 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
28748 Operands[1]);
28749 }
28750
28751 if (Operands[0].getOpcode() == ISD::ADD)
28752 std::swap(Operands[0], Operands[1]);
28753 else if (Operands[1].getOpcode() != ISD::ADD)
28754 return SDValue();
28755 Operands[2] = Operands[1].getOperand(0);
28756 Operands[1] = Operands[1].getOperand(1);
28757
28758 // Now we have three operands of two additions. Check that one of them is a
28759 // constant vector with ones, and the other two are promoted from i8/i16.
28760 for (int i = 0; i < 3; ++i) {
28761 if (!IsConstVectorInRange(Operands[i], 1, 1))
28762 continue;
28763 std::swap(Operands[i], Operands[2]);
28764
28765 // Check if Operands[0] and Operands[1] are results of type promotion.
28766 for (int j = 0; j < 2; ++j)
28767 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
28768 Operands[j].getOperand(0).getValueType() != VT)
28769 return SDValue();
28770
28771 // The pattern is detected, emit X86ISD::AVG instruction.
28772 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
28773 Operands[1].getOperand(0));
28774 }
28775
28776 return SDValue();
28777 }
28778
28779 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
28780 TargetLowering::DAGCombinerInfo &DCI,
28781 const X86Subtarget &Subtarget) {
28782 LoadSDNode *Ld = cast<LoadSDNode>(N);
28783 EVT RegVT = Ld->getValueType(0);
28784 EVT MemVT = Ld->getMemoryVT();
28785 SDLoc dl(Ld);
28786 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28787
28788 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
28789 // into two 16-byte operations.
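  // Sketch of the split (a 256-bit load becomes two 128-bit loads whose chains
  // are joined by a TokenFactor and whose values are re-inserted into a
  // 256-bit vector):
  //   Lo = load <128-bit half>, Ptr
  //   Hi = load <128-bit half>, Ptr + 16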
28790 ISD::LoadExtType Ext = Ld->getExtensionType();
28791 bool Fast;
28792 unsigned AddressSpace = Ld->getAddressSpace();
28793 unsigned Alignment = Ld->getAlignment();
28794 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
28795 Ext == ISD::NON_EXTLOAD &&
28796 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
28797 AddressSpace, Alignment, &Fast) && !Fast) {
28798 unsigned NumElems = RegVT.getVectorNumElements();
28799 if (NumElems < 2)
28800 return SDValue();
28801
28802 SDValue Ptr = Ld->getBasePtr();
28803
28804 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
28805 NumElems/2);
28806 SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
28807 Ld->getPointerInfo(), Ld->isVolatile(),
28808 Ld->isNonTemporal(), Ld->isInvariant(),
28809 Alignment);
28810
28811 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
28812 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
28813 Ld->getPointerInfo(), Ld->isVolatile(),
28814 Ld->isNonTemporal(), Ld->isInvariant(),
28815 std::min(16U, Alignment));
28816 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
28817 Load1.getValue(1),
28818 Load2.getValue(1));
28819
28820 SDValue NewVec = DAG.getUNDEF(RegVT);
28821 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
28822 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
28823 return DCI.CombineTo(N, NewVec, TF, true);
28824 }
28825
28826 return SDValue();
28827 }
28828
28829 /// If V is a build vector of boolean constants and exactly one of those
28830 /// constants is true, return the operand index of that true element.
28831 /// Otherwise, return -1.
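/// For example (a sketch): <i1 0, i1 undef, i1 -1, i1 0> returns 2, while a
/// mask with more than one true element such as <i1 -1, i1 -1, i1 0, i1 0>
/// returns -1.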
28832 static int getOneTrueElt(SDValue V) {
28833 // This needs to be a build vector of booleans.
28834 // TODO: Checking for the i1 type matches the IR definition for the mask,
28835 // but the mask check could be loosened to i8 or other types. That might
28836 // also require checking more than 'allOnesValue'; e.g., the x86 HW
28837 // instructions only require that the MSB is set for each mask element.
28838 // The ISD::MSTORE comments/definition do not specify how the mask operand
28839 // is formatted.
28840 auto *BV = dyn_cast<BuildVectorSDNode>(V);
28841 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
28842 return -1;
28843
28844 int TrueIndex = -1;
28845 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
28846 for (unsigned i = 0; i < NumElts; ++i) {
28847 const SDValue &Op = BV->getOperand(i);
28848 if (Op.isUndef())
28849 continue;
28850 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
28851 if (!ConstNode)
28852 return -1;
28853 if (ConstNode->getAPIntValue().isAllOnesValue()) {
28854 // If we already found a one, this is too many.
28855 if (TrueIndex >= 0)
28856 return -1;
28857 TrueIndex = i;
28858 }
28859 }
28860 return TrueIndex;
28861 }
28862
28863 /// Given a masked memory load/store operation, return true if it has one mask
28864 /// bit set. If it has one mask bit set, then also return the memory address of
28865 /// the scalar element to load/store, the vector index to insert/extract that
28866 /// scalar element, and the alignment for the scalar memory access.
28867 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
28868 SelectionDAG &DAG, SDValue &Addr,
28869 SDValue &Index, unsigned &Alignment) {
28870 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
28871 if (TrueMaskElt < 0)
28872 return false;
28873
28874 // Get the address of the one scalar element that is specified by the mask
28875 // using the appropriate offset from the base pointer.
28876 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
28877 Addr = MaskedOp->getBasePtr();
28878 if (TrueMaskElt != 0) {
28879 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
28880 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
28881 }
28882
28883 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
28884 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
28885 return true;
28886 }
28887
28888 /// If exactly one element of the mask is set for a non-extending masked load,
28889 /// it can be reduced to a scalar load and a vector insert.
28890 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
28891 /// mask have already been optimized in IR, so we don't bother with those here.
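/// Sketch of the transform (assuming a v4f32 load with mask <0, 0, 1, 0> and
/// pass-through P):
///   X = masked_load Ptr, Mask, P
/// becomes
///   S = load float, Ptr + 2 * sizeof(float)
///   X = insertelement P, S, 2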
28892 static SDValue
28893 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
28894 TargetLowering::DAGCombinerInfo &DCI) {
28895 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
28896 // However, some target hooks may need to be added to know when the transform
28897 // is profitable. Endianness would also have to be considered.
28898
28899 SDValue Addr, VecIndex;
28900 unsigned Alignment;
28901 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
28902 return SDValue();
28903
28904 // Load the one scalar element that is specified by the mask using the
28905 // appropriate offset from the base pointer.
28906 SDLoc DL(ML);
28907 EVT VT = ML->getValueType(0);
28908 EVT EltVT = VT.getVectorElementType();
28909 SDValue Load = DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
28910 ML->getPointerInfo(), ML->isVolatile(),
28911 ML->isNonTemporal(), ML->isInvariant(), Alignment);
28912
28913 // Insert the loaded element into the appropriate place in the vector.
28914 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
28915 Load, VecIndex);
28916 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
28917 }
28918
28919 static SDValue
28920 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
28921 TargetLowering::DAGCombinerInfo &DCI) {
28922 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
28923 return SDValue();
28924
28925 SDLoc DL(ML);
28926 EVT VT = ML->getValueType(0);
28927
28928 // If we are loading the first and last elements of a vector, it is safe and
28929 // always faster to load the whole vector. Replace the masked load with a
28930 // vector load and select.
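  // Sketch: with mask <1, 0, 0, 1> the first and last lanes are accessed
  // anyway, so the whole vector lies within dereferenceable memory:
  //   V = load VT, Ptr
  //   X = select <1, 0, 0, 1>, V, PassThru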
28931 unsigned NumElts = VT.getVectorNumElements();
28932 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
28933 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
28934 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
28935 if (LoadFirstElt && LoadLastElt) {
28936 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
28937 ML->getMemOperand());
28938 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
28939 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
28940 }
28941
28942 // Convert a masked load with a constant mask into a masked load and a select.
28943 // This allows the select operation to use a faster kind of select instruction
28944 // (for example, vblendvps -> vblendps).
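  // Sketch of the rewrite (assuming constant mask M and pass-through S0):
  //   X = masked_load Ptr, M, S0
  // becomes
  //   T = masked_load Ptr, M, undef
  //   X = select M, T, S0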
28945
28946 // Don't try this if the pass-through operand is already undefined. That would
28947 // cause an infinite loop because that's what we're about to create.
28948 if (ML->getSrc0().isUndef())
28949 return SDValue();
28950
28951 // The new masked load has an undef pass-through operand. The select uses the
28952 // original pass-through operand.
28953 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
28954 ML->getMask(), DAG.getUNDEF(VT),
28955 ML->getMemoryVT(), ML->getMemOperand(),
28956 ML->getExtensionType());
28957 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
28958
28959 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
28960 }
28961
28962 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
28963 TargetLowering::DAGCombinerInfo &DCI,
28964 const X86Subtarget &Subtarget) {
28965 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
28966 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
28967 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
28968 return ScalarLoad;
28969 // TODO: Do some AVX512 subsets benefit from this transform?
28970 if (!Subtarget.hasAVX512())
28971 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
28972 return Blend;
28973 }
28974
28975 if (Mld->getExtensionType() != ISD::SEXTLOAD)
28976 return SDValue();
28977
28978 // Resolve extending loads.
28979 EVT VT = Mld->getValueType(0);
28980 unsigned NumElems = VT.getVectorNumElements();
28981 EVT LdVT = Mld->getMemoryVT();
28982 SDLoc dl(Mld);
28983
28984 assert(LdVT != VT && "Cannot extend to the same type");
28985 unsigned ToSz = VT.getVectorElementType().getSizeInBits();
28986 unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
28987 // From/To sizes and ElemCount must be pow of two.
28988 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
28989 "Unexpected size for extending masked load");
28990
28991 unsigned SizeRatio = ToSz / FromSz;
28992 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
28993
28994 // Create a type on which we perform the shuffle.
28995 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
28996 LdVT.getScalarType(), NumElems*SizeRatio);
28997 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
28998
28999 // Convert Src0 value.
29000 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
29001 if (!Mld->getSrc0().isUndef()) {
29002 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
29003 for (unsigned i = 0; i != NumElems; ++i)
29004 ShuffleVec[i] = i * SizeRatio;
29005
29006 // Can't shuffle using an illegal type.
29007 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
29008 "WideVecVT should be legal");
29009 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
29010 DAG.getUNDEF(WideVecVT), ShuffleVec);
29011 }
29012 // Prepare the new mask.
29013 SDValue NewMask;
29014 SDValue Mask = Mld->getMask();
29015 if (Mask.getValueType() == VT) {
29016 // Mask and original value have the same type.
29017 NewMask = DAG.getBitcast(WideVecVT, Mask);
29018 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
29019 for (unsigned i = 0; i != NumElems; ++i)
29020 ShuffleVec[i] = i * SizeRatio;
29021 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
29022 ShuffleVec[i] = NumElems * SizeRatio;
29023 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
29024 DAG.getConstant(0, dl, WideVecVT),
29025 ShuffleVec);
29026 } else {
29027 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
29028 unsigned WidenNumElts = NumElems*SizeRatio;
29029 unsigned MaskNumElts = VT.getVectorNumElements();
29030 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
29031 WidenNumElts);
29032
29033 unsigned NumConcat = WidenNumElts / MaskNumElts;
29034 SmallVector<SDValue, 16> Ops(NumConcat);
29035 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
29036 Ops[0] = Mask;
29037 for (unsigned i = 1; i != NumConcat; ++i)
29038 Ops[i] = ZeroVal;
29039
29040 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
29041 }
29042
29043 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
29044 Mld->getBasePtr(), NewMask, WideSrc0,
29045 Mld->getMemoryVT(), Mld->getMemOperand(),
29046 ISD::NON_EXTLOAD);
29047 SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
29048 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
29049 }
29050
29051 /// If exactly one element of the mask is set for a non-truncating masked store,
29052 /// it can be reduced to a vector extract and a scalar store.
29053 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
29054 /// mask have already been optimized in IR, so we don't bother with those here.
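/// Sketch of the transform (assuming a v4i32 store with mask <0, 1, 0, 0>):
///   masked_store V, Ptr, Mask
/// becomes
///   S = extractelement V, 1
///   store S, Ptr + 1 * sizeof(i32)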
29055 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
29056 SelectionDAG &DAG) {
29057 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
29058 // However, some target hooks may need to be added to know when the transform
29059 // is profitable. Endianness would also have to be considered.
29060
29061 SDValue Addr, VecIndex;
29062 unsigned Alignment;
29063 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
29064 return SDValue();
29065
29066 // Extract the one scalar element that is actually being stored.
29067 SDLoc DL(MS);
29068 EVT VT = MS->getValue().getValueType();
29069 EVT EltVT = VT.getVectorElementType();
29070 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
29071 MS->getValue(), VecIndex);
29072
29073 // Store that element at the appropriate offset from the base pointer.
29074 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
29075 MS->isVolatile(), MS->isNonTemporal(), Alignment);
29076 }
29077
29078 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
29079 const X86Subtarget &Subtarget) {
29080 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
29081 if (!Mst->isTruncatingStore())
29082 return reduceMaskedStoreToScalarStore(Mst, DAG);
29083
29084 // Resolve truncating stores.
29085 EVT VT = Mst->getValue().getValueType();
29086 unsigned NumElems = VT.getVectorNumElements();
29087 EVT StVT = Mst->getMemoryVT();
29088 SDLoc dl(Mst);
29089
29090 assert(StVT != VT && "Cannot truncate to the same type");
29091 unsigned FromSz = VT.getVectorElementType().getSizeInBits();
29092 unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
29093
29094 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29095
29096 // The truncating store is legal in some cases. For example
29097 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
29098 // are designated for truncate store.
29099 // In this case we don't need any further transformations.
29100 if (TLI.isTruncStoreLegal(VT, StVT))
29101 return SDValue();
29102
29103 // From/To sizes and ElemCount must be pow of two.
29104 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
29105 "Unexpected size for truncating masked store");
29106 // We are going to use the original vector elt for storing.
29107 // Accumulated smaller vector elements must be a multiple of the store size.
29108 assert (((NumElems * FromSz) % ToSz) == 0 &&
29109 "Unexpected ratio for truncating masked store");
29110
29111 unsigned SizeRatio = FromSz / ToSz;
29112 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
29113
29114 // Create a type on which we perform the shuffle.
29115 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
29116 StVT.getScalarType(), NumElems*SizeRatio);
29117
29118 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
29119
29120 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
29121 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
29122 for (unsigned i = 0; i != NumElems; ++i)
29123 ShuffleVec[i] = i * SizeRatio;
29124
29125 // Can't shuffle using an illegal type.
29126 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
29127 "WideVecVT should be legal");
29128
29129 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
29130 DAG.getUNDEF(WideVecVT),
29131 ShuffleVec);
29132
29133 SDValue NewMask;
29134 SDValue Mask = Mst->getMask();
29135 if (Mask.getValueType() == VT) {
29136 // Mask and original value have the same type.
29137 NewMask = DAG.getBitcast(WideVecVT, Mask);
29138 for (unsigned i = 0; i != NumElems; ++i)
29139 ShuffleVec[i] = i * SizeRatio;
29140 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
29141 ShuffleVec[i] = NumElems*SizeRatio;
29142 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
29143 DAG.getConstant(0, dl, WideVecVT),
29144 ShuffleVec);
29145 } else {
29146 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
29147 unsigned WidenNumElts = NumElems*SizeRatio;
29148 unsigned MaskNumElts = VT.getVectorNumElements();
29149 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
29150 WidenNumElts);
29151
29152 unsigned NumConcat = WidenNumElts / MaskNumElts;
29153 SmallVector<SDValue, 16> Ops(NumConcat);
29154 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
29155 Ops[0] = Mask;
29156 for (unsigned i = 1; i != NumConcat; ++i)
29157 Ops[i] = ZeroVal;
29158
29159 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
29160 }
29161
29162 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
29163 Mst->getBasePtr(), NewMask, StVT,
29164 Mst->getMemOperand(), false);
29165 }
29166
29167 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
29168 const X86Subtarget &Subtarget) {
29169 StoreSDNode *St = cast<StoreSDNode>(N);
29170 EVT VT = St->getValue().getValueType();
29171 EVT StVT = St->getMemoryVT();
29172 SDLoc dl(St);
29173 SDValue StoredVal = St->getOperand(1);
29174 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29175
29176 // If we are saving a concatenation of two XMM registers and 32-byte stores
29177 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
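  // Sketch: a single 32-byte store of a YMM value becomes
  //   store <128-bit low half>,  Ptr
  //   store <128-bit high half>, Ptr + 16
  // chained together with a TokenFactor.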
29178 bool Fast;
29179 unsigned AddressSpace = St->getAddressSpace();
29180 unsigned Alignment = St->getAlignment();
29181 if (VT.is256BitVector() && StVT == VT &&
29182 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
29183 AddressSpace, Alignment, &Fast) &&
29184 !Fast) {
29185 unsigned NumElems = VT.getVectorNumElements();
29186 if (NumElems < 2)
29187 return SDValue();
29188
29189 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
29190 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
29191
29192 SDValue Ptr0 = St->getBasePtr();
29193 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
29194
29195 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
29196 St->getPointerInfo(), St->isVolatile(),
29197 St->isNonTemporal(), Alignment);
29198 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
29199 St->getPointerInfo(), St->isVolatile(),
29200 St->isNonTemporal(),
29201 std::min(16U, Alignment));
29202 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
29203 }
29204
29205 // Optimize trunc store (of multiple scalars) to shuffle and store.
29206 // First, pack all of the elements in one place. Next, store to memory
29207 // in fewer chunks.
29208 if (St->isTruncatingStore() && VT.isVector()) {
29209 // Check if we can detect an AVG pattern from the truncation. If yes,
29210 // replace the trunc store by a normal store with the result of X86ISD::AVG
29211 // instruction.
29212 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
29213 Subtarget, dl))
29214 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
29215 St->getPointerInfo(), St->isVolatile(),
29216 St->isNonTemporal(), St->getAlignment());
29217
29218 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29219 unsigned NumElems = VT.getVectorNumElements();
29220 assert(StVT != VT && "Cannot truncate to the same type");
29221 unsigned FromSz = VT.getVectorElementType().getSizeInBits();
29222 unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
29223
29224 // The truncating store is legal in some cases. For example
29225 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
29226 // are designated for truncate store.
29227 // In this case we don't need any further transformations.
29228 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
29229 return SDValue();
29230
29231 // From, To sizes and ElemCount must be pow of two
29232 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
29233 // We are going to use the original vector elt for storing.
29234 // Accumulated smaller vector elements must be a multiple of the store size.
29235 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
29236
29237 unsigned SizeRatio = FromSz / ToSz;
29238
29239 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
29240
29241 // Create a type on which we perform the shuffle
29242 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
29243 StVT.getScalarType(), NumElems*SizeRatio);
29244
29245 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
29246
29247 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
29248 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
29249 for (unsigned i = 0; i != NumElems; ++i)
29250 ShuffleVec[i] = i * SizeRatio;
29251
29252 // Can't shuffle using an illegal type.
29253 if (!TLI.isTypeLegal(WideVecVT))
29254 return SDValue();
29255
29256 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
29257 DAG.getUNDEF(WideVecVT),
29258 ShuffleVec);
29259 // At this point all of the data is stored at the bottom of the
29260 // register. We now need to save it to mem.
29261
29262 // Find the largest store unit
29263 MVT StoreType = MVT::i8;
29264 for (MVT Tp : MVT::integer_valuetypes()) {
29265 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
29266 StoreType = Tp;
29267 }
29268
29269 // On 32-bit systems, we can't store 64-bit integers. Try bitcasting to f64.
29270 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
29271 (64 <= NumElems * ToSz))
29272 StoreType = MVT::f64;
29273
29274 // Bitcast the original vector into a vector of store-size units
29275 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
29276 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
29277 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
29278 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
29279 SmallVector<SDValue, 8> Chains;
29280 SDValue Ptr = St->getBasePtr();
29281
29282 // Perform one or more big stores into memory.
29283 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
29284 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
29285 StoreType, ShuffWide,
29286 DAG.getIntPtrConstant(i, dl));
29287 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
29288 St->getPointerInfo(), St->isVolatile(),
29289 St->isNonTemporal(), St->getAlignment());
29290 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
29291 Chains.push_back(Ch);
29292 }
29293
29294 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
29295 }
29296
29297 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
29298 // the FP state in cases where an emms may be missing.
29299 // A preferable solution to the general problem is to figure out the right
29300 // places to insert EMMS. This qualifies as a quick hack.
29301
29302 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
29303 if (VT.getSizeInBits() != 64)
29304 return SDValue();
29305
29306 const Function *F = DAG.getMachineFunction().getFunction();
29307 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
29308 bool F64IsLegal =
29309 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
29310 if ((VT.isVector() ||
29311 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
29312 isa<LoadSDNode>(St->getValue()) &&
29313 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
29314 St->getChain().hasOneUse() && !St->isVolatile()) {
29315 SDNode* LdVal = St->getValue().getNode();
29316 LoadSDNode *Ld = nullptr;
29317 int TokenFactorIndex = -1;
29318 SmallVector<SDValue, 8> Ops;
29319 SDNode* ChainVal = St->getChain().getNode();
29320 // Must be a store of a load. We currently handle two cases: the load
29321 // is a direct child, and it's under an intervening TokenFactor. It is
29322 // possible to dig deeper under nested TokenFactors.
29323 if (ChainVal == LdVal)
29324 Ld = cast<LoadSDNode>(St->getChain());
29325 else if (St->getValue().hasOneUse() &&
29326 ChainVal->getOpcode() == ISD::TokenFactor) {
29327 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
29328 if (ChainVal->getOperand(i).getNode() == LdVal) {
29329 TokenFactorIndex = i;
29330 Ld = cast<LoadSDNode>(St->getValue());
29331 } else
29332 Ops.push_back(ChainVal->getOperand(i));
29333 }
29334 }
29335
29336 if (!Ld || !ISD::isNormalLoad(Ld))
29337 return SDValue();
29338
29339 // If this is not the MMX case, i.e. we are just turning i64 load/store
29340 // into f64 load/store, avoid the transformation if there are multiple
29341 // uses of the loaded value.
29342 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
29343 return SDValue();
29344
29345 SDLoc LdDL(Ld);
29346 SDLoc StDL(N);
29347 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
29348 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
29349 // pair instead.
29350 if (Subtarget.is64Bit() || F64IsLegal) {
29351 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
29352 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
29353 Ld->getPointerInfo(), Ld->isVolatile(),
29354 Ld->isNonTemporal(), Ld->isInvariant(),
29355 Ld->getAlignment());
29356 SDValue NewChain = NewLd.getValue(1);
29357 if (TokenFactorIndex >= 0) {
29358 Ops.push_back(NewChain);
29359 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
29360 }
29361 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
29362 St->getPointerInfo(),
29363 St->isVolatile(), St->isNonTemporal(),
29364 St->getAlignment());
29365 }
29366
29367 // Otherwise, lower to two pairs of 32-bit loads / stores.
29368 SDValue LoAddr = Ld->getBasePtr();
29369 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
29370
29371 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
29372 Ld->getPointerInfo(),
29373 Ld->isVolatile(), Ld->isNonTemporal(),
29374 Ld->isInvariant(), Ld->getAlignment());
29375 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
29376 Ld->getPointerInfo().getWithOffset(4),
29377 Ld->isVolatile(), Ld->isNonTemporal(),
29378 Ld->isInvariant(),
29379 MinAlign(Ld->getAlignment(), 4));
29380
29381 SDValue NewChain = LoLd.getValue(1);
29382 if (TokenFactorIndex >= 0) {
29383 Ops.push_back(LoLd);
29384 Ops.push_back(HiLd);
29385 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
29386 }
29387
29388 LoAddr = St->getBasePtr();
29389 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
29390
29391 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
29392 St->getPointerInfo(),
29393 St->isVolatile(), St->isNonTemporal(),
29394 St->getAlignment());
29395 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
29396 St->getPointerInfo().getWithOffset(4),
29397 St->isVolatile(),
29398 St->isNonTemporal(),
29399 MinAlign(St->getAlignment(), 4));
29400 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
29401 }
29402
29403 // This is similar to the above case, but here we handle a scalar 64-bit
29404 // integer store that is extracted from a vector on a 32-bit target.
29405 // If we have SSE2, then we can treat it like a floating-point double
29406 // to get past legalization. The execution dependencies fixup pass will
29407 // choose the optimal machine instruction for the store if this really is
29408 // an integer or v2f32 rather than an f64.
29409 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
29410 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
29411 SDValue OldExtract = St->getOperand(1);
29412 SDValue ExtOp0 = OldExtract.getOperand(0);
29413 unsigned VecSize = ExtOp0.getValueSizeInBits();
29414 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
29415 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
29416 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
29417 BitCast, OldExtract.getOperand(1));
29418 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
29419 St->getPointerInfo(), St->isVolatile(),
29420 St->isNonTemporal(), St->getAlignment());
29421 }
29422
29423 return SDValue();
29424 }
29425
29426 /// Return 'true' if this vector operation is "horizontal"
29427 /// and return the operands for the horizontal operation in LHS and RHS. A
29428 /// horizontal operation performs the binary operation on successive elements
29429 /// of its first operand, then on successive elements of its second operand,
29430 /// returning the resulting values in a vector. For example, if
29431 /// A = < float a0, float a1, float a2, float a3 >
29432 /// and
29433 /// B = < float b0, float b1, float b2, float b3 >
29434 /// then the result of doing a horizontal operation on A and B is
29435 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
29436 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
29437 /// A horizontal-op B, for some already available A and B, and if so then LHS is
29438 /// set to A, RHS to B, and the routine returns 'true'.
29439 /// Note that the binary operation should have the property that if one of the
29440 /// operands is UNDEF then the result is UNDEF.
29441 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
29442 // Look for the following pattern: if
29443 // A = < float a0, float a1, float a2, float a3 >
29444 // B = < float b0, float b1, float b2, float b3 >
29445 // and
29446 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
29447 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
29448 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
29449 // which is A horizontal-op B.
29450
29451 // At least one of the operands should be a vector shuffle.
29452 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
29453 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
29454 return false;
29455
29456 MVT VT = LHS.getSimpleValueType();
29457
29458 assert((VT.is128BitVector() || VT.is256BitVector()) &&
29459 "Unsupported vector type for horizontal add/sub");
29460
29461 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
29462 // operate independently on 128-bit lanes.
29463 unsigned NumElts = VT.getVectorNumElements();
29464 unsigned NumLanes = VT.getSizeInBits()/128;
29465 unsigned NumLaneElts = NumElts / NumLanes;
29466 assert((NumLaneElts % 2 == 0) &&
29467 "Vector type should have an even number of elements in each lane");
29468 unsigned HalfLaneElts = NumLaneElts/2;
29469
29470 // View LHS in the form
29471 // LHS = VECTOR_SHUFFLE A, B, LMask
29472 // If LHS is not a shuffle then pretend it is the shuffle
29473 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
29474 // NOTE: in what follows a default initialized SDValue represents an UNDEF of
29475 // type VT.
29476 SDValue A, B;
29477 SmallVector<int, 16> LMask(NumElts);
29478 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
29479 if (!LHS.getOperand(0).isUndef())
29480 A = LHS.getOperand(0);
29481 if (!LHS.getOperand(1).isUndef())
29482 B = LHS.getOperand(1);
29483 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
29484 std::copy(Mask.begin(), Mask.end(), LMask.begin());
29485 } else {
29486 if (!LHS.isUndef())
29487 A = LHS;
29488 for (unsigned i = 0; i != NumElts; ++i)
29489 LMask[i] = i;
29490 }
29491
29492 // Likewise, view RHS in the form
29493 // RHS = VECTOR_SHUFFLE C, D, RMask
29494 SDValue C, D;
29495 SmallVector<int, 16> RMask(NumElts);
29496 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
29497 if (!RHS.getOperand(0).isUndef())
29498 C = RHS.getOperand(0);
29499 if (!RHS.getOperand(1).isUndef())
29500 D = RHS.getOperand(1);
29501 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
29502 std::copy(Mask.begin(), Mask.end(), RMask.begin());
29503 } else {
29504 if (!RHS.isUndef())
29505 C = RHS;
29506 for (unsigned i = 0; i != NumElts; ++i)
29507 RMask[i] = i;
29508 }
29509
29510 // Check that the shuffles are both shuffling the same vectors.
29511 if (!(A == C && B == D) && !(A == D && B == C))
29512 return false;
29513
29514 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
29515 if (!A.getNode() && !B.getNode())
29516 return false;
29517
29518 // If A and B occur in reverse order in RHS, then "swap" them (which means
29519 // rewriting the mask).
29520 if (A != C)
29521 ShuffleVectorSDNode::commuteMask(RMask);
29522
29523 // At this point LHS and RHS are equivalent to
29524 // LHS = VECTOR_SHUFFLE A, B, LMask
29525 // RHS = VECTOR_SHUFFLE A, B, RMask
29526 // Check that the masks correspond to performing a horizontal operation.
29527 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
29528 for (unsigned i = 0; i != NumLaneElts; ++i) {
29529 int LIdx = LMask[i+l], RIdx = RMask[i+l];
29530
29531 // Ignore any UNDEF components.
29532 if (LIdx < 0 || RIdx < 0 ||
29533 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
29534 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
29535 continue;
29536
29537 // Check that successive elements are being operated on. If not, this is
29538 // not a horizontal operation.
29539 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
29540 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
29541 if (!(LIdx == Index && RIdx == Index + 1) &&
29542 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
29543 return false;
29544 }
29545 }
29546
29547 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
29548 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
29549 return true;
29550 }
29551
29552 /// Do target-specific dag combines on floating-point adds/subs.
29553 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
29554 const X86Subtarget &Subtarget) {
29555 EVT VT = N->getValueType(0);
29556 SDValue LHS = N->getOperand(0);
29557 SDValue RHS = N->getOperand(1);
29558 bool IsFadd = N->getOpcode() == ISD::FADD;
29559 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
29560
29561 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
29562 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
29563 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
29564 isHorizontalBinOp(LHS, RHS, IsFadd)) {
29565 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
29566 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
29567 }
29568 return SDValue();
29569 }
29570
29571 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
29572 static SDValue
29573 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
29574 SmallVector<SDValue, 8> &Regs) {
29575 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
29576 Regs[0].getValueType() == MVT::v2i64));
29577 EVT OutVT = N->getValueType(0);
29578 EVT OutSVT = OutVT.getVectorElementType();
29579 EVT InVT = Regs[0].getValueType();
29580 EVT InSVT = InVT.getVectorElementType();
29581 SDLoc DL(N);
29582
29583 // First, use mask to unset all bits that won't appear in the result.
29584 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
29585 "OutSVT can only be either i8 or i16.");
29586 APInt Mask =
29587 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
29588 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
29589 for (auto &Reg : Regs)
29590 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
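  // For instance, for an i32 -> i8 truncation each lane is ANDed with 0xFF
  // (a sketch; the mask above is computed from the actual element sizes), so
  // the later unsigned-saturating PACKUS keeps exactly the low byte.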
29591
29592 MVT UnpackedVT, PackedVT;
29593 if (OutSVT == MVT::i8) {
29594 UnpackedVT = MVT::v8i16;
29595 PackedVT = MVT::v16i8;
29596 } else {
29597 UnpackedVT = MVT::v4i32;
29598 PackedVT = MVT::v8i16;
29599 }
29600
29601 // In each iteration, halve the element size of the vector type.
29602 auto RegNum = Regs.size();
29603 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
29604 j < e; j *= 2, RegNum /= 2) {
29605 for (unsigned i = 0; i < RegNum; i++)
29606 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
29607 for (unsigned i = 0; i < RegNum / 2; i++)
29608 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
29609 Regs[i * 2 + 1]);
29610 }
29611
29612 // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS, and
29613 // then extract a subvector as the result since v8i8 is not a legal type.
29614 if (OutVT == MVT::v8i8) {
29615 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
29616 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
29617 DAG.getIntPtrConstant(0, DL));
29618 return Regs[0];
29619 } else if (RegNum > 1) {
29620 Regs.resize(RegNum);
29621 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
29622 } else
29623 return Regs[0];
29624 }
29625
29626 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
29627 static SDValue
29628 combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
29629 SmallVector<SDValue, 8> &Regs) {
29630 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
29631 EVT OutVT = N->getValueType(0);
29632 SDLoc DL(N);
29633
29634 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
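  // Per-lane effect (a sketch on a 32-bit lane, keeping the low 16 bits):
  //   t = (x << 16) >> 16;   // arithmetic shift sign-extends bit 15
  // so the following PACKSS can narrow the lanes without saturating values
  // that fit in 16 bits.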
29635 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
29636 for (auto &Reg : Regs) {
29637 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
29638 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
29639 }
29640
29641 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
29642 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
29643 Regs[i * 2 + 1]);
29644
29645 if (Regs.size() > 2) {
29646 Regs.resize(Regs.size() / 2);
29647 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
29648 } else
29649 return Regs[0];
29650 }
29651
29652 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
29653 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
29654 /// legalization the truncation will be translated into a BUILD_VECTOR with each
29655 /// element that is extracted from a vector and then truncated, and it is
29656 /// difficult to do this optimization based on them.
29657 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
29658 const X86Subtarget &Subtarget) {
29659 EVT OutVT = N->getValueType(0);
29660 if (!OutVT.isVector())
29661 return SDValue();
29662
29663 SDValue In = N->getOperand(0);
29664 if (!In.getValueType().isSimple())
29665 return SDValue();
29666
29667 EVT InVT = In.getValueType();
29668 unsigned NumElems = OutVT.getVectorNumElements();
29669
29670 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
29671 // SSE2, and we need to take care of it specially.
29672 // AVX512 provides vpmovdb.
29673 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
29674 return SDValue();
29675
29676 EVT OutSVT = OutVT.getVectorElementType();
29677 EVT InSVT = InVT.getVectorElementType();
29678 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
29679 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
29680 NumElems >= 8))
29681 return SDValue();
29682
29683 // SSSE3's pshufb results in fewer instructions in the cases below.
29684 if (Subtarget.hasSSSE3() && NumElems == 8 &&
29685 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
29686 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
29687 return SDValue();
29688
29689 SDLoc DL(N);
29690
29691 // Split a long vector into vectors of legal type.
29692 unsigned RegNum = InVT.getSizeInBits() / 128;
29693 SmallVector<SDValue, 8> SubVec(RegNum);
29694 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
29695 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
29696
29697 for (unsigned i = 0; i < RegNum; i++)
29698 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
29699 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
29700
29701 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
29702 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
29703 // truncate 2 x v4i32 to v8i16.
29704 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
29705 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
29706 else if (InSVT == MVT::i32)
29707 return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
29708 else
29709 return SDValue();
29710 }
29711
29712 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
29713 const X86Subtarget &Subtarget) {
29714 EVT VT = N->getValueType(0);
29715 SDValue Src = N->getOperand(0);
29716 SDLoc DL(N);
29717
29718 // Try to detect AVG pattern first.
29719 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
29720 return Avg;
29721
29722 // The bitcast source is a direct mmx result.
29723 // Detect bitcasts from x86mmx to i32.
29724 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
29725 SDValue BCSrc = Src.getOperand(0);
29726 if (BCSrc.getValueType() == MVT::x86mmx)
29727 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
29728 }
29729
29730 return combineVectorTruncation(N, DAG, Subtarget);
29731 }
29732
29733 /// Do target-specific dag combines on floating point negations.
29734 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
29735 const X86Subtarget &Subtarget) {
29736 EVT VT = N->getValueType(0);
29737 EVT SVT = VT.getScalarType();
29738 SDValue Arg = N->getOperand(0);
29739 SDLoc DL(N);
29740
29741 // Let legalize expand this if it isn't a legal type yet.
29742 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
29743 return SDValue();
29744
29745 // If we're negating a FMUL node on a target with FMA, then we can avoid the
29746 // use of a constant by performing (-0 - A*B) instead.
29747 // FIXME: Check rounding control flags as well once it becomes available.
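  // Sketch: FNMSUB(A, B, 0.0) computes -(A * B) - 0.0, which equals -(A * B)
  // when signed zeros can be ignored (checked via hasNoSignedZeros below).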
29748 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
29749 Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
29750 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
29751 return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
29752 Arg.getOperand(1), Zero);
29753 }
29754
29755 // If we're negating a FMA node, then we can adjust the
29756 // instruction to include the extra negation.
29757 if (Arg.hasOneUse()) {
29758 switch (Arg.getOpcode()) {
29759 case X86ISD::FMADD:
29760 return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
29761 Arg.getOperand(1), Arg.getOperand(2));
29762 case X86ISD::FMSUB:
29763 return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
29764 Arg.getOperand(1), Arg.getOperand(2));
29765 case X86ISD::FNMADD:
29766 return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
29767 Arg.getOperand(1), Arg.getOperand(2));
29768 case X86ISD::FNMSUB:
29769 return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
29770 Arg.getOperand(1), Arg.getOperand(2));
29771 }
29772 }
29773 return SDValue();
29774 }
29775
29776 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
29777 const X86Subtarget &Subtarget) {
29778 EVT VT = N->getValueType(0);
29779 if (VT.is512BitVector() && !Subtarget.hasDQI()) {
29780 // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extension.
29781 // These logic operations may be executed in the integer domain.
29782 SDLoc dl(N);
29783 MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
29784 MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
29785
29786 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
29787 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
29788 unsigned IntOpcode = 0;
29789 switch (N->getOpcode()) {
29790 default: llvm_unreachable("Unexpected FP logic op");
29791 case X86ISD::FOR: IntOpcode = ISD::OR; break;
29792 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
29793 case X86ISD::FAND: IntOpcode = ISD::AND; break;
29794 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
29795 }
29796 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
29797 return DAG.getBitcast(VT, IntOp);
29798 }
29799 return SDValue();
29800 }
29801 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
29802 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
29803 const X86Subtarget &Subtarget) {
29804 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
29805
29806 // F[X]OR(0.0, x) -> x
29807 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
29808 if (C->getValueAPF().isPosZero())
29809 return N->getOperand(1);
29810
29811 // F[X]OR(x, 0.0) -> x
29812 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
29813 if (C->getValueAPF().isPosZero())
29814 return N->getOperand(0);
29815
29816 return lowerX86FPLogicOp(N, DAG, Subtarget);
29817 }
29818
29819 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
29820 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
29821 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
29822
29823 // Only perform optimizations if UnsafeMath is used.
29824 if (!DAG.getTarget().Options.UnsafeFPMath)
29825 return SDValue();
29826
29827 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
29828 // into FMINC and FMAXC, which are commutative operations.
29829 unsigned NewOp = 0;
29830 switch (N->getOpcode()) {
29831 default: llvm_unreachable("unknown opcode");
29832 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
29833 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
29834 }
29835
29836 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
29837 N->getOperand(0), N->getOperand(1));
29838 }
29839
29840 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
29841 const X86Subtarget &Subtarget) {
29842 if (Subtarget.useSoftFloat())
29843 return SDValue();
29844
29845 // TODO: Check for global or instruction-level "nnan". In that case, we
29846 // should be able to lower to FMAX/FMIN alone.
29847 // TODO: If an operand is already known to be a NaN or not a NaN, this
29848 // should be an optional swap and FMAX/FMIN.
29849
29850 EVT VT = N->getValueType(0);
29851 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
29852 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
29853 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
29854 return SDValue();
29855
29856 // This takes at least 3 instructions, so favor a library call when operating
29857 // on a scalar and minimizing code size.
29858 if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
29859 return SDValue();
29860
29861 SDValue Op0 = N->getOperand(0);
29862 SDValue Op1 = N->getOperand(1);
29863 SDLoc DL(N);
29864 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
29865 DAG.getDataLayout(), *DAG.getContext(), VT);
29866
29867 // There are 4 possibilities involving NaN inputs, and these are the required
29868 // outputs:
29869 // Op1
29870 // Num NaN
29871 // ----------------
29872 // Num | Max | Op0 |
29873 // Op0 ----------------
29874 // NaN | Op1 | NaN |
29875 // ----------------
29876 //
29877 // The SSE FP max/min instructions were not designed for this case, but rather
29878 // to implement:
29879 // Min = Op1 < Op0 ? Op1 : Op0
29880 // Max = Op1 > Op0 ? Op1 : Op0
29881 //
29882 // So they always return Op0 if either input is a NaN. However, we can still
29883 // use those instructions for fmaxnum by selecting away a NaN input.
29884
29885 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
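// For example, for fmaxnum with Op0 = NaN and Op1 = 1.0: FMAX(Op1, Op0)
// returns the NaN in Op0, but the SETUO compare below detects that Op0 is
// NaN and the select yields Op1 = 1.0, as fmaxnum requires.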
29886 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
29887 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
29888 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
29889
29890 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
29891 // are NaN, the NaN value of Op1 is the result.
29892 auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
29893 return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
29894 }
29895
29896 /// Do target-specific dag combines on X86ISD::FAND nodes.
29897 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
29898 const X86Subtarget &Subtarget) {
29899 // FAND(0.0, x) -> 0.0
29900 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
29901 if (C->getValueAPF().isPosZero())
29902 return N->getOperand(0);
29903
29904 // FAND(x, 0.0) -> 0.0
29905 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
29906 if (C->getValueAPF().isPosZero())
29907 return N->getOperand(1);
29908
29909 return lowerX86FPLogicOp(N, DAG, Subtarget);
29910 }
29911
29912 /// Do target-specific dag combines on X86ISD::FANDN nodes
29913 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
29914 const X86Subtarget &Subtarget) {
29915 // FANDN(0.0, x) -> x
29916 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
29917 if (C->getValueAPF().isPosZero())
29918 return N->getOperand(1);
29919
29920 // FANDN(x, 0.0) -> 0.0
29921 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
29922 if (C->getValueAPF().isPosZero())
29923 return N->getOperand(1);
29924
29925 return lowerX86FPLogicOp(N, DAG, Subtarget);
29926 }
29927
29928 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
29929 TargetLowering::DAGCombinerInfo &DCI) {
29930 // BT ignores high bits in the bit index operand.
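// For example, for (i64 BT x, (and idx, 63)) only the low 6 bits of the
// index are demanded, so SimplifyDemandedBits can strip the mask.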
29931 SDValue Op1 = N->getOperand(1);
29932 if (Op1.hasOneUse()) {
29933 unsigned BitWidth = Op1.getValueSizeInBits();
29934 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
29935 APInt KnownZero, KnownOne;
29936 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
29937 !DCI.isBeforeLegalizeOps());
29938 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29939 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
29940 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
29941 DCI.CommitTargetLoweringOpt(TLO);
29942 }
29943 return SDValue();
29944 }
29945
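// Fold away a VZEXT_MOVL whose source (looking through bitcasts) is a
// VZEXT_LOAD of the same element width: the load has already zeroed the
// upper elements, so a plain bitcast suffices.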
29946 static SDValue combineVZextMovl(SDNode *N, SelectionDAG &DAG) {
29947 SDValue Op = peekThroughBitcasts(N->getOperand(0));
29948 EVT VT = N->getValueType(0), OpVT = Op.getValueType();
29949 if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
29950 VT.getVectorElementType().getSizeInBits() ==
29951 OpVT.getVectorElementType().getSizeInBits()) {
29952 return DAG.getBitcast(VT, Op);
29953 }
29954 return SDValue();
29955 }
29956
29957 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
29958 const X86Subtarget &Subtarget) {
29959 EVT VT = N->getValueType(0);
29960 if (!VT.isVector())
29961 return SDValue();
29962
29963 SDValue N0 = N->getOperand(0);
29964 SDValue N1 = N->getOperand(1);
29965 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
29966 SDLoc dl(N);
29967
29968 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
29969 // AVX2 since there is no sign-extended shift right operation on a vector
29970 // with 64-bit elements.
29971 // (sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
29972 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
29973 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
29974 N0.getOpcode() == ISD::SIGN_EXTEND)) {
29975 SDValue N00 = N0.getOperand(0);
29976
29977 // EXTLOAD has a better solution on AVX2,
29978 // it may be replaced with X86ISD::VSEXT node.
29979 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
29980 if (!ISD::isNormalLoad(N00.getNode()))
29981 return SDValue();
29982
29983 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
29984 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
29985 N00, N1);
29986 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
29987 }
29988 }
29989 return SDValue();
29990 }
29991
29992 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
29993 /// Promoting a sign extension ahead of an 'add nsw' exposes opportunities
29994 /// to combine math ops, use an LEA, or use a complex addressing mode. This can
29995 /// eliminate extend, add, and shift instructions.
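/// For example (illustrative), (i64 sext (i32 add nsw X, 5)) becomes
/// (i64 add nsw (sext X), 5), which a later combine may fold together with
/// another add or shift of the result into a single LEA.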
29996 static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
29997 const X86Subtarget &Subtarget) {
29998 // TODO: This should be valid for other integer types.
29999 EVT VT = Sext->getValueType(0);
30000 if (VT != MVT::i64)
30001 return SDValue();
30002
30003 // We need an 'add nsw' feeding into the 'sext'.
30004 SDValue Add = Sext->getOperand(0);
30005 if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap())
30006 return SDValue();
30007
30008 // Having a constant operand to the 'add' ensures that we are not increasing
30009 // the instruction count because the constant is extended for free below.
30010 // A constant operand can also become the displacement field of an LEA.
30011 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
30012 if (!AddOp1)
30013 return SDValue();
30014
30015 // Don't make the 'add' bigger if there's no hope of combining it with some
30016 // other 'add' or 'shl' instruction.
30017 // TODO: It may be profitable to generate simpler LEA instructions in place
30018 // of single 'add' instructions, but the cost model for selecting an LEA
30019 // currently has a high threshold.
30020 bool HasLEAPotential = false;
30021 for (auto *User : Sext->uses()) {
30022 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
30023 HasLEAPotential = true;
30024 break;
30025 }
30026 }
30027 if (!HasLEAPotential)
30028 return SDValue();
30029
30030 // Everything looks good, so pull the 'sext' ahead of the 'add'.
30031 int64_t AddConstant = AddOp1->getSExtValue();
30032 SDValue AddOp0 = Add.getOperand(0);
30033 SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0);
30034 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
30035
30036 // The wider add is guaranteed to not wrap because both operands are
30037 // sign-extended.
30038 SDNodeFlags Flags;
30039 Flags.setNoSignedWrap(true);
30040 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags);
30041 }
30042
30043 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
30044 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
30045 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
30046 /// extends from AH (which we otherwise need to do contortions to access).
30047 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
30048 SDValue N0 = N->getOperand(0);
30049 auto OpcodeN = N->getOpcode();
30050 auto OpcodeN0 = N0.getOpcode();
30051 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
30052 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
30053 return SDValue();
30054
30055 EVT VT = N->getValueType(0);
30056 EVT InVT = N0.getValueType();
30057 if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
30058 return SDValue();
30059
30060 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
30061 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
30062 : X86ISD::UDIVREM8_ZEXT_HREG;
30063 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
30064 N0.getOperand(1));
30065 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
30066 return R.getValue(1);
30067 }
30068
30069 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
30070 /// ZERO_EXTEND_VECTOR_INREG. This requires splitting the input (or
30071 /// concatenating it with UNDEFs) into vectors of the same size as the target
30072 /// type; the new node then extends the lowest elements.
30073 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
30074 TargetLowering::DAGCombinerInfo &DCI,
30075 const X86Subtarget &Subtarget) {
30076 unsigned Opcode = N->getOpcode();
30077 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
30078 return SDValue();
30079 if (!DCI.isBeforeLegalizeOps())
30080 return SDValue();
30081 if (!Subtarget.hasSSE2())
30082 return SDValue();
30083
30084 SDValue N0 = N->getOperand(0);
30085 EVT VT = N->getValueType(0);
30086 EVT SVT = VT.getScalarType();
30087 EVT InVT = N0.getValueType();
30088 EVT InSVT = InVT.getScalarType();
30089
30090 // Input type must be a vector and we must be extending legal integer types.
30091 if (!VT.isVector())
30092 return SDValue();
30093 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
30094 return SDValue();
30095 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
30096 return SDValue();
30097
30098 // On AVX2+ targets, if the input/output types are both legal then we will be
30099 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
30100 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
30101 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
30102 return SDValue();
30103
30104 SDLoc DL(N);
30105
30106 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
30107 EVT InVT = N.getValueType();
30108 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
30109 Size / InVT.getScalarSizeInBits());
30110 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
30111 DAG.getUNDEF(InVT));
30112 Opnds[0] = N;
30113 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
30114 };
30115
30116 // If the target size is less than 128 bits, extend to a type that would
30117 // extend to 128 bits, extend that, and extract the original target vector.
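// For example (assuming such a node reaches here), (v2i32 zext (v2i16 x))
// is widened to v4i16 with undefs, zero-extended to v4i32, and the low
// v2i32 subvector is extracted.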
30118 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
30119 unsigned Scale = 128 / VT.getSizeInBits();
30120 EVT ExVT =
30121 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
30122 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
30123 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
30124 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
30125 DAG.getIntPtrConstant(0, DL));
30126 }
30127
30128 // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
30129 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
30130 // Also use this if we don't have SSE41 to allow the legalizer to do its job.
30131 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
30132 (VT.is256BitVector() && Subtarget.hasInt256())) {
30133 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
30134 return Opcode == ISD::SIGN_EXTEND
30135 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
30136 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
30137 }
30138
30139 // On pre-AVX2 targets, split into 128-bit nodes of
30140 // ISD::*_EXTEND_VECTOR_INREG.
30141 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) {
30142 unsigned NumVecs = VT.getSizeInBits() / 128;
30143 unsigned NumSubElts = 128 / SVT.getSizeInBits();
30144 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
30145 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
30146
30147 SmallVector<SDValue, 8> Opnds;
30148 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
30149 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
30150 DAG.getIntPtrConstant(Offset, DL));
30151 SrcVec = ExtendVecSize(DL, SrcVec, 128);
30152 SrcVec = Opcode == ISD::SIGN_EXTEND
30153 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
30154 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
30155 Opnds.push_back(SrcVec);
30156 }
30157 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
30158 }
30159
30160 return SDValue();
30161 }
30162
30163 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
30164 TargetLowering::DAGCombinerInfo &DCI,
30165 const X86Subtarget &Subtarget) {
30166 SDValue N0 = N->getOperand(0);
30167 EVT VT = N->getValueType(0);
30168 EVT InVT = N0.getValueType();
30169 SDLoc DL(N);
30170
30171 if (SDValue DivRem8 = getDivRem8(N, DAG))
30172 return DivRem8;
30173
30174 if (!DCI.isBeforeLegalizeOps()) {
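// After legalize ops, (sext (i1 x)) is expanded to (select x, -1, 0),
// matching the all-ones/zero boolean contents used elsewhere.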
30175 if (InVT == MVT::i1) {
30176 SDValue Zero = DAG.getConstant(0, DL, VT);
30177 SDValue AllOnes =
30178 DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
30179 return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
30180 }
30181 return SDValue();
30182 }
30183
30184 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
30185 return V;
30186
30187 if (Subtarget.hasAVX() && VT.is256BitVector())
30188 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
30189 return R;
30190
30191 if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget))
30192 return NewAdd;
30193
30194 return SDValue();
30195 }
30196
30197 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
30198 const X86Subtarget &Subtarget) {
30199 SDLoc dl(N);
30200 EVT VT = N->getValueType(0);
30201
30202 // Let legalize expand this if it isn't a legal type yet.
30203 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
30204 return SDValue();
30205
30206 EVT ScalarVT = VT.getScalarType();
30207 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
30208 return SDValue();
30209
30210 SDValue A = N->getOperand(0);
30211 SDValue B = N->getOperand(1);
30212 SDValue C = N->getOperand(2);
30213
30214 bool NegA = (A.getOpcode() == ISD::FNEG);
30215 bool NegB = (B.getOpcode() == ISD::FNEG);
30216 bool NegC = (C.getOpcode() == ISD::FNEG);
30217
30218 // Negative multiplication when NegA xor NegB
30219 bool NegMul = (NegA != NegB);
30220 if (NegA)
30221 A = A.getOperand(0);
30222 if (NegB)
30223 B = B.getOperand(0);
30224 if (NegC)
30225 C = C.getOperand(0);
30226
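// Fold the negations into the FMA opcode: a negated product selects the
// FNMADD/FNMSUB forms and a negated addend selects the *SUB forms.
// For example, fma(-a, b, -c) == -(a*b) - c, which is X86ISD::FNMSUB.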
30227 unsigned Opcode;
30228 if (!NegMul)
30229 Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
30230 else
30231 Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
30232
30233 return DAG.getNode(Opcode, dl, VT, A, B, C);
30234 }
30235
30236 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
30237 TargetLowering::DAGCombinerInfo &DCI,
30238 const X86Subtarget &Subtarget) {
30239 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
30240 // (and (i32 x86isd::setcc_carry), 1)
30241 // This eliminates the zext. This transformation is necessary because
30242 // ISD::SETCC is always legalized to i8.
30243 SDLoc dl(N);
30244 SDValue N0 = N->getOperand(0);
30245 EVT VT = N->getValueType(0);
30246
30247 if (N0.getOpcode() == ISD::AND &&
30248 N0.hasOneUse() &&
30249 N0.getOperand(0).hasOneUse()) {
30250 SDValue N00 = N0.getOperand(0);
30251 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
30252 if (!isOneConstant(N0.getOperand(1)))
30253 return SDValue();
30254 return DAG.getNode(ISD::AND, dl, VT,
30255 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
30256 N00.getOperand(0), N00.getOperand(1)),
30257 DAG.getConstant(1, dl, VT));
30258 }
30259 }
30260
30261 if (N0.getOpcode() == ISD::TRUNCATE &&
30262 N0.hasOneUse() &&
30263 N0.getOperand(0).hasOneUse()) {
30264 SDValue N00 = N0.getOperand(0);
30265 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
30266 return DAG.getNode(ISD::AND, dl, VT,
30267 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
30268 N00.getOperand(0), N00.getOperand(1)),
30269 DAG.getConstant(1, dl, VT));
30270 }
30271 }
30272
30273 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
30274 return V;
30275
30276 if (VT.is256BitVector())
30277 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
30278 return R;
30279
30280 if (SDValue DivRem8 = getDivRem8(N, DAG))
30281 return DivRem8;
30282
30283 return SDValue();
30284 }
30285
30286 /// Optimize x == -y --> x+y == 0
30287 /// x != -y --> x+y != 0
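/// For example, (setcc eq X, (sub 0, Y)) is rewritten as
/// (setcc eq (add X, Y), 0), avoiding the materialization of the negation.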
30288 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
30289 const X86Subtarget &Subtarget) {
30290 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
30291 SDValue LHS = N->getOperand(0);
30292 SDValue RHS = N->getOperand(1);
30293 EVT VT = N->getValueType(0);
30294 SDLoc DL(N);
30295
30296 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
30297 if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
30298 SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
30299 LHS.getOperand(1));
30300 return DAG.getSetCC(DL, N->getValueType(0), addV,
30301 DAG.getConstant(0, DL, addV.getValueType()), CC);
30302 }
30303 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
30304 if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
30305 SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
30306 RHS.getOperand(1));
30307 return DAG.getSetCC(DL, N->getValueType(0), addV,
30308 DAG.getConstant(0, DL, addV.getValueType()), CC);
30309 }
30310
30311 if (VT.getScalarType() == MVT::i1 &&
30312 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
30313 bool IsSEXT0 =
30314 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
30315 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
30316 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
30317
30318 if (!IsSEXT0 || !IsVZero1) {
30319 // Swap the operands and update the condition code.
30320 std::swap(LHS, RHS);
30321 CC = ISD::getSetCCSwappedOperands(CC);
30322
30323 IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
30324 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
30325 IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
30326 }
30327
30328 if (IsSEXT0 && IsVZero1) {
30329 assert(VT == LHS.getOperand(0).getValueType() &&
30330 "Uexpected operand type");
30331 if (CC == ISD::SETGT)
30332 return DAG.getConstant(0, DL, VT);
30333 if (CC == ISD::SETLE)
30334 return DAG.getConstant(1, DL, VT);
30335 if (CC == ISD::SETEQ || CC == ISD::SETGE)
30336 return DAG.getNOT(DL, LHS.getOperand(0), VT);
30337
30338 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
30339 "Unexpected condition code!");
30340 return LHS.getOperand(0);
30341 }
30342 }
30343
30344 // For an SSE1-only target, lower to X86ISD::CMPP early to avoid scalarization
30345 // via legalization because v4i32 is not a legal type.
30346 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32)
30347 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
30348
30349 return SDValue();
30350 }
30351
30352 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
30353 SDLoc DL(N);
30354 // Gather and Scatter instructions use k-registers for masks. The type of
30355 // the masks is v*i1. So the mask will be truncated anyway.
30356 // The SIGN_EXTEND_INREG may be dropped.
30357 SDValue Mask = N->getOperand(2);
30358 if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
30359 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
30360 NewOps[2] = Mask.getOperand(0);
30361 DAG.UpdateNodeOperands(N, NewOps);
30362 }
30363 return SDValue();
30364 }
30365
30366 // Helper function of combineX86SetCC. It materializes "setb reg"
30367 // as "sbb reg,reg", since it can be extended without zext and produces
30368 // an all-ones bit which is more useful than 0/1 in some cases.
30369 static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
30370 SelectionDAG &DAG, MVT VT) {
30371 if (VT == MVT::i8)
30372 return DAG.getNode(ISD::AND, DL, VT,
30373 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
30374 DAG.getConstant(X86::COND_B, DL, MVT::i8),
30375 EFLAGS),
30376 DAG.getConstant(1, DL, VT));
30377 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
30378 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
30379 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
30380 DAG.getConstant(X86::COND_B, DL, MVT::i8),
30381 EFLAGS));
30382 }
30383
30384 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
30385 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
30386 TargetLowering::DAGCombinerInfo &DCI,
30387 const X86Subtarget &Subtarget) {
30388 SDLoc DL(N);
30389 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
30390 SDValue EFLAGS = N->getOperand(1);
30391
30392 if (CC == X86::COND_A) {
30393 // Try to convert COND_A into COND_B in an attempt to facilitate
30394 // materializing "setb reg".
30395 //
30396 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
30397 // cannot take an immediate as its first operand.
30398 //
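// For example, for (setcc COND_A (sub x, y)) we emit (sub y, x) instead and
// test COND_B: x >u y holds exactly when y - x sets the carry flag.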
30399 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
30400 EFLAGS.getValueType().isInteger() &&
30401 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
30402 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
30403 EFLAGS.getNode()->getVTList(),
30404 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
30405 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
30406 return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
30407 }
30408 }
30409
30410 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
30411 // a zext and produces an all-ones bit which is more useful than 0/1 in some
30412 // cases.
30413 if (CC == X86::COND_B)
30414 return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
30415
30416 // Try to simplify the EFLAGS and condition code operands.
30417 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
30418 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
30419 return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
30420 }
30421
30422 return SDValue();
30423 }
30424
30425 /// Optimize branch condition evaluation.
30426 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
30427 TargetLowering::DAGCombinerInfo &DCI,
30428 const X86Subtarget &Subtarget) {
30429 SDLoc DL(N);
30430 SDValue EFLAGS = N->getOperand(3);
30431 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
30432
30433 // Try to simplify the EFLAGS and condition code operands.
30434 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
30435 // RAUW them under us.
30436 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
30437 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
30438 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
30439 N->getOperand(1), Cond, Flags);
30440 }
30441
30442 return SDValue();
30443 }
30444
30445 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
30446 SelectionDAG &DAG) {
30447 // Take advantage of vector comparisons producing 0 or -1 in each lane to
30448 // optimize away operation when it's from a constant.
30449 //
30450 // The general transformation is:
30451 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
30452 // AND(VECTOR_CMP(x,y), constant2)
30453 // constant2 = UNARYOP(constant)
30454
30455 // Early exit if this isn't a vector operation, the operand of the
30456 // unary operation isn't a bitwise AND, or if the sizes of the operations
30457 // aren't the same.
30458 EVT VT = N->getValueType(0);
30459 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
30460 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
30461 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
30462 return SDValue();
30463
30464 // Now check that the other operand of the AND is a constant. We could
30465 // make the transformation for non-constant splats as well, but it's unclear
30466 // that would be a benefit as it would not eliminate any operations, just
30467 // perform one more step in scalar code before moving to the vector unit.
30468 if (BuildVectorSDNode *BV =
30469 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
30470 // Bail out if the vector isn't a constant.
30471 if (!BV->isConstant())
30472 return SDValue();
30473
30474 // Everything checks out. Build up the new and improved node.
30475 SDLoc DL(N);
30476 EVT IntVT = BV->getValueType(0);
30477 // Create a new constant of the appropriate type for the transformed
30478 // DAG.
30479 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
30480 // The AND node needs bitcasts to/from an integer vector type around it.
30481 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
30482 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
30483 N->getOperand(0)->getOperand(0), MaskConst);
30484 SDValue Res = DAG.getBitcast(VT, NewAnd);
30485 return Res;
30486 }
30487
30488 return SDValue();
30489 }
30490
30491 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
30492 const X86Subtarget &Subtarget) {
30493 SDValue Op0 = N->getOperand(0);
30494 EVT VT = N->getValueType(0);
30495 EVT InVT = Op0.getValueType();
30496 EVT InSVT = InVT.getScalarType();
30497 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30498
30499 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
30500 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
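// The zero-extended value fits in well under 31 bits, so it is non-negative
// as an i32 and SINT_TO_FP gives the same result when UINT_TO_FP is not
// legal for the widened type.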
30501 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
30502 SDLoc dl(N);
30503 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
30504 InVT.getVectorNumElements());
30505 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
30506
30507 if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
30508 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
30509
30510 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
30511 }
30512
30513 return SDValue();
30514 }
30515
30516 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
30517 const X86Subtarget &Subtarget) {
30518 // First try to optimize away the conversion entirely when it's
30519 // conditionally from a constant. Vectors only.
30520 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
30521 return Res;
30522
30523 // Now move on to more general possibilities.
30524 SDValue Op0 = N->getOperand(0);
30525 EVT VT = N->getValueType(0);
30526 EVT InVT = Op0.getValueType();
30527 EVT InSVT = InVT.getScalarType();
30528
30529 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
30530 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
30531 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
30532 SDLoc dl(N);
30533 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
30534 InVT.getVectorNumElements());
30535 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
30536 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
30537 }
30538
30539 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
30540 // a 32-bit target where SSE doesn't support i64->FP operations.
30541 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
30542 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
30543 EVT LdVT = Ld->getValueType(0);
30544
30545 // This transformation is not supported if the result type is f16 or f128.
30546 if (VT == MVT::f16 || VT == MVT::f128)
30547 return SDValue();
30548
30549 if (!Ld->isVolatile() && !VT.isVector() &&
30550 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
30551 !Subtarget.is64Bit() && LdVT == MVT::i64) {
30552 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
30553 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
30554 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
30555 return FILDChain;
30556 }
30557 }
30558 return SDValue();
30559 }
30560
30561 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
30562 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
30563 X86TargetLowering::DAGCombinerInfo &DCI) {
30564 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
30565 // the result is either zero or one (depending on the input carry bit).
30566 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
30567 if (X86::isZeroNode(N->getOperand(0)) &&
30568 X86::isZeroNode(N->getOperand(1)) &&
30569 // We don't have a good way to replace an EFLAGS use, so only do this when
30570 // dead right now.
30571 SDValue(N, 1).use_empty()) {
30572 SDLoc DL(N);
30573 EVT VT = N->getValueType(0);
30574 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
30575 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
30576 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
30577 DAG.getConstant(X86::COND_B, DL,
30578 MVT::i8),
30579 N->getOperand(2)),
30580 DAG.getConstant(1, DL, VT));
30581 return DCI.CombineTo(N, Res1, CarryOut);
30582 }
30583
30584 return SDValue();
30585 }
30586
30587 /// fold (add Y, (sete X, 0)) -> adc 0, Y
30588 /// (add Y, (setne X, 0)) -> sbb -1, Y
30589 /// (sub (sete X, 0), Y) -> sbb 0, Y
30590 /// (sub (setne X, 0), Y) -> adc -1, Y
30591 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
30592 SDLoc DL(N);
30593
30594 // Look through ZExts.
30595 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
30596 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
30597 return SDValue();
30598
30599 SDValue SetCC = Ext.getOperand(0);
30600 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
30601 return SDValue();
30602
30603 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
30604 if (CC != X86::COND_E && CC != X86::COND_NE)
30605 return SDValue();
30606
30607 SDValue Cmp = SetCC.getOperand(1);
30608 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
30609 !X86::isZeroNode(Cmp.getOperand(1)) ||
30610 !Cmp.getOperand(0).getValueType().isInteger())
30611 return SDValue();
30612
30613 SDValue CmpOp0 = Cmp.getOperand(0);
30614 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
30615 DAG.getConstant(1, DL, CmpOp0.getValueType()));
30616
30617 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
30618 if (CC == X86::COND_NE)
30619 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
30620 DL, OtherVal.getValueType(), OtherVal,
30621 DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
30622 NewCmp);
30623 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
30624 DL, OtherVal.getValueType(), OtherVal,
30625 DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
30626 }
30627
30628 static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG,
30629 const X86Subtarget &Subtarget) {
30630 SDLoc DL(N);
30631 EVT VT = N->getValueType(0);
30632 SDValue Op0 = N->getOperand(0);
30633 SDValue Op1 = N->getOperand(1);
30634
30635 if (!VT.isVector() || !VT.isSimple() ||
30636 !(VT.getVectorElementType() == MVT::i32))
30637 return SDValue();
30638
30639 unsigned RegSize = 128;
30640 if (Subtarget.hasBWI())
30641 RegSize = 512;
30642 else if (Subtarget.hasAVX2())
30643 RegSize = 256;
30644
30645 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
30646 if (VT.getSizeInBits() / 4 > RegSize)
30647 return SDValue();
30648
30649 // Detect the following pattern:
30650 //
30651 // 1: %2 = zext <N x i8> %0 to <N x i32>
30652 // 2: %3 = zext <N x i8> %1 to <N x i32>
30653 // 3: %4 = sub nsw <N x i32> %2, %3
30654 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
30655 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
30656 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
30657 // 7: %8 = add nsw <N x i32> %7, %vec.phi
30658 //
30659 // The last instruction must be a reduction add. Instructions 3-6 form an
30660 // ABSDIFF pattern.
30661
30662 // The two operands of reduction add are from PHI and a select-op as in line 7
30663 // above.
30664 SDValue SelectOp, Phi;
30665 if (Op0.getOpcode() == ISD::VSELECT) {
30666 SelectOp = Op0;
30667 Phi = Op1;
30668 } else if (Op1.getOpcode() == ISD::VSELECT) {
30669 SelectOp = Op1;
30670 Phi = Op0;
30671 } else
30672 return SDValue();
30673
30674 // Check the condition of the select instruction is greater-than.
30675 SDValue SetCC = SelectOp->getOperand(0);
30676 if (SetCC.getOpcode() != ISD::SETCC)
30677 return SDValue();
30678 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
30679 if (CC != ISD::SETGT)
30680 return SDValue();
30681
30682 Op0 = SelectOp->getOperand(1);
30683 Op1 = SelectOp->getOperand(2);
30684
30685 // The second operand of SelectOp Op1 is the negation of the first operand
30686 // Op0, which is implemented as 0 - Op0.
30687 if (!(Op1.getOpcode() == ISD::SUB &&
30688 ISD::isBuildVectorAllZeros(Op1.getOperand(0).getNode()) &&
30689 Op1.getOperand(1) == Op0))
30690 return SDValue();
30691
30692 // The first operand of SetCC is the first operand of SelectOp, which is the
30693 // difference between two input vectors.
30694 if (SetCC.getOperand(0) != Op0)
30695 return SDValue();
30696
30697 // The second operand of > comparison can be either -1 or 0.
30698 if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
30699 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
30700 return SDValue();
30701
30702 // The first operand of SelectOp is the difference between two input vectors.
30703 if (Op0.getOpcode() != ISD::SUB)
30704 return SDValue();
30705
30706 Op1 = Op0.getOperand(1);
30707 Op0 = Op0.getOperand(0);
30708
30709 // Check if the operands of the diff are zero-extended from vectors of i8.
30710 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
30711 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
30712 Op1.getOpcode() != ISD::ZERO_EXTEND ||
30713 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
30714 return SDValue();
30715
30716 // SAD pattern detected. Now build a SAD instruction and an addition for
30717 // reduction. Note that the number of elements in the result of SAD is less
30718 // than the number of elements of its input. Therefore, we can only update
30719 // part of the elements in the reduction vector.
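// (PSADBW sums the absolute differences of eight byte pairs into a single
// i64 per 64-bit lane, so for example a v16i8 input yields only two i64
// results.)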
30720
30721 // Legalize the type of the inputs of PSADBW.
30722 EVT InVT = Op0.getOperand(0).getValueType();
30723 if (InVT.getSizeInBits() <= 128)
30724 RegSize = 128;
30725 else if (InVT.getSizeInBits() <= 256)
30726 RegSize = 256;
30727
30728 unsigned NumConcat = RegSize / InVT.getSizeInBits();
30729 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
30730 Ops[0] = Op0.getOperand(0);
30731 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
30732 Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30733 Ops[0] = Op1.getOperand(0);
30734 Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30735
30736 // The output of PSADBW is a vector of i64.
30737 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
30738 SDValue Sad = DAG.getNode(X86ISD::PSADBW, DL, SadVT, Op0, Op1);
30739
30740 // We need to turn the vector of i64 into a vector of i32.
30741 // If the reduction vector is at least as wide as the psadbw result, just
30742 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
30743 // anyway.
30744 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30745 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
30746 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
30747 else
30748 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
30749
30750 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
30751 // Update part of elements of the reduction vector. This is done by first
30752 // extracting a sub-vector from it, updating this sub-vector, and inserting
30753 // it back.
30754 SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
30755 DAG.getIntPtrConstant(0, DL));
30756 SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
30757 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
30758 DAG.getIntPtrConstant(0, DL));
30759 } else
30760 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
30761 }
30762
30763 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
30764 const X86Subtarget &Subtarget) {
30765 const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
30766 if (Flags->hasVectorReduction()) {
30767 if (SDValue Sad = detectSADPattern(N, DAG, Subtarget))
30768 return Sad;
30769 }
30770 EVT VT = N->getValueType(0);
30771 SDValue Op0 = N->getOperand(0);
30772 SDValue Op1 = N->getOperand(1);
30773
30774 // Try to synthesize horizontal adds from adds of shuffles.
30775 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
30776 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
30777 isHorizontalBinOp(Op0, Op1, true))
30778 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
30779
30780 return OptimizeConditionalInDecrement(N, DAG);
30781 }
30782
30783 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
30784 const X86Subtarget &Subtarget) {
30785 SDValue Op0 = N->getOperand(0);
30786 SDValue Op1 = N->getOperand(1);
30787
30788 // X86 can't encode an immediate LHS of a sub. See if we can push the
30789 // negation into a preceding instruction.
30790 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
30791 // If the RHS of the sub is a XOR with one use and a constant, invert the
30792 // immediate. Then add one to the LHS of the sub so we can turn
30793 // X-Y -> X+~Y+1, saving one register.
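// For example, (sub 5, (xor x, 2)) becomes (add (xor x, ~2), 6), since
// x ^ ~2 == ~(x ^ 2) == -(x ^ 2) - 1.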
30794 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
30795 isa<ConstantSDNode>(Op1.getOperand(1))) {
30796 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
30797 EVT VT = Op0.getValueType();
30798 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
30799 Op1.getOperand(0),
30800 DAG.getConstant(~XorC, SDLoc(Op1), VT));
30801 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
30802 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
30803 }
30804 }
30805
30806 // Try to synthesize horizontal adds from adds of shuffles.
30807 EVT VT = N->getValueType(0);
30808 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
30809 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
30810 isHorizontalBinOp(Op0, Op1, true))
30811 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
30812
30813 return OptimizeConditionalInDecrement(N, DAG);
30814 }
30815
30816 static SDValue combineVZext(SDNode *N, SelectionDAG &DAG,
30817 TargetLowering::DAGCombinerInfo &DCI,
30818 const X86Subtarget &Subtarget) {
30819 SDLoc DL(N);
30820 MVT VT = N->getSimpleValueType(0);
30821 MVT SVT = VT.getVectorElementType();
30822 SDValue Op = N->getOperand(0);
30823 MVT OpVT = Op.getSimpleValueType();
30824 MVT OpEltVT = OpVT.getVectorElementType();
30825 unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
30826
30827 // Perform any constant folding.
30828 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
30829 SmallVector<SDValue, 4> Vals;
30830 for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
30831 SDValue OpElt = Op.getOperand(i);
30832 if (OpElt.getOpcode() == ISD::UNDEF) {
30833 Vals.push_back(DAG.getUNDEF(SVT));
30834 continue;
30835 }
30836 APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
30837 assert(Cst.getBitWidth() == OpEltVT.getSizeInBits());
30838 Cst = Cst.zextOrTrunc(SVT.getSizeInBits());
30839 Vals.push_back(DAG.getConstant(Cst, DL, SVT));
30840 }
30841 return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Vals);
30842 }
30843
30844 // (vzext (bitcast (vzext (x)) -> (vzext x)
30845 SDValue V = peekThroughBitcasts(Op);
30846 if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
30847 MVT InnerVT = V.getSimpleValueType();
30848 MVT InnerEltVT = InnerVT.getVectorElementType();
30849
30850 // If the element sizes match exactly, we can just do one larger vzext. This
30851 // is always an exact type match as vzext operates on integer types.
30852 if (OpEltVT == InnerEltVT) {
30853 assert(OpVT == InnerVT && "Types must match for vzext!");
30854 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
30855 }
30856
30857 // The only other way we can combine them is if only a single element of the
30858 // inner vzext is used in the input to the outer vzext.
30859 if (InnerEltVT.getSizeInBits() < InputBits)
30860 return SDValue();
30861
30862 // In this case, the inner vzext is completely dead because we're going to
30863 // only look at bits inside of the low element. Just do the outer vzext on
30864 // a bitcast of the input to the inner.
30865 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
30866 }
30867
30868 // Check if we can bypass extracting and re-inserting an element of an input
30869 // vector. Essentially:
30870 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
30871 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
30872 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
30873 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
30874 SDValue ExtractedV = V.getOperand(0);
30875 SDValue OrigV = ExtractedV.getOperand(0);
30876 if (isNullConstant(ExtractedV.getOperand(1))) {
30877 MVT OrigVT = OrigV.getSimpleValueType();
30878 // Extract a subvector if necessary...
30879 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
30880 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
30881 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
30882 OrigVT.getVectorNumElements() / Ratio);
30883 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
30884 DAG.getIntPtrConstant(0, DL));
30885 }
30886 Op = DAG.getBitcast(OpVT, OrigV);
30887 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
30888 }
30889 }
30890
30891 return SDValue();
30892 }
30893
30894 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
30895 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
30896 const X86Subtarget &Subtarget) {
30897 SDValue Chain = N->getOperand(0);
30898 SDValue LHS = N->getOperand(1);
30899 SDValue RHS = N->getOperand(2);
30900 MVT VT = RHS.getSimpleValueType();
30901 SDLoc DL(N);
30902
30903 auto *C = dyn_cast<ConstantSDNode>(RHS);
30904 if (!C || C->getZExtValue() != 1)
30905 return SDValue();
30906
30907 RHS = DAG.getConstant(-1, DL, VT);
30908 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
30909 return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
30910 DAG.getVTList(MVT::i32, MVT::Other),
30911 {Chain, LHS, RHS}, VT, MMO);
30912 }
30913
30914 // TEST (AND a, b), (AND a, b) -> TEST a, b
30915 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
30916 SDValue Op0 = N->getOperand(0);
30917 SDValue Op1 = N->getOperand(1);
30918
30919 if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
30920 return SDValue();
30921
30922 EVT VT = N->getValueType(0);
30923 SDLoc DL(N);
30924
30925 return DAG.getNode(X86ISD::TESTM, DL, VT,
30926 Op0->getOperand(0), Op0->getOperand(1));
30927 }
30928
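// Fold vector integer compares with identical operands: x == x is all-ones
// and x >s x is all-zeros.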
30929 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
30930 const X86Subtarget &Subtarget) {
30931 MVT VT = N->getSimpleValueType(0);
30932 SDLoc DL(N);
30933
30934 if (N->getOperand(0) == N->getOperand(1)) {
30935 if (N->getOpcode() == X86ISD::PCMPEQ)
30936 return getOnesVector(VT, Subtarget, DAG, DL);
30937 if (N->getOpcode() == X86ISD::PCMPGT)
30938 return getZeroVector(VT, Subtarget, DAG, DL);
30939 }
30940
30941 return SDValue();
30942 }
30943
30944
30945 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
30946 DAGCombinerInfo &DCI) const {
30947 SelectionDAG &DAG = DCI.DAG;
30948 switch (N->getOpcode()) {
30949 default: break;
30950 case ISD::EXTRACT_VECTOR_ELT: return combineExtractVectorElt(N, DAG, DCI);
30951 case ISD::VSELECT:
30952 case ISD::SELECT:
30953 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
30954 case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget);
30955 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
30956 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
30957 case ISD::SUB: return combineSub(N, DAG, Subtarget);
30958 case X86ISD::ADC: return combineADC(N, DAG, DCI);
30959 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
30960 case ISD::SHL:
30961 case ISD::SRA:
30962 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
30963 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
30964 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
30965 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
30966 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
30967 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
30968 case ISD::STORE: return combineStore(N, DAG, Subtarget);
30969 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
30970 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
30971 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
30972 case ISD::FADD:
30973 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
30974 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
30975 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
30976 case X86ISD::FXOR:
30977 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
30978 case X86ISD::FMIN:
30979 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
30980 case ISD::FMINNUM:
30981 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
30982 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
30983 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
30984 case X86ISD::BT: return combineBT(N, DAG, DCI);
30985 case X86ISD::VZEXT_MOVL: return combineVZextMovl(N, DAG);
30986 case ISD::ANY_EXTEND:
30987 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
30988 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
30989 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
30990 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
30991 case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget);
30992 case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget);
30993 case X86ISD::VZEXT: return combineVZext(N, DAG, DCI, Subtarget);
30994 case X86ISD::SHUFP: // Handle all target specific shuffles
30995 case X86ISD::INSERTPS:
30996 case X86ISD::PALIGNR:
30997 case X86ISD::VSHLDQ:
30998 case X86ISD::VSRLDQ:
30999 case X86ISD::BLENDI:
31000 case X86ISD::UNPCKH:
31001 case X86ISD::UNPCKL:
31002 case X86ISD::MOVHLPS:
31003 case X86ISD::MOVLHPS:
31004 case X86ISD::PSHUFB:
31005 case X86ISD::PSHUFD:
31006 case X86ISD::PSHUFHW:
31007 case X86ISD::PSHUFLW:
31008 case X86ISD::MOVSHDUP:
31009 case X86ISD::MOVSLDUP:
31010 case X86ISD::MOVDDUP:
31011 case X86ISD::MOVSS:
31012 case X86ISD::MOVSD:
31013 case X86ISD::VPPERM:
31014 case X86ISD::VPERMV:
31015 case X86ISD::VPERMV3:
31016 case X86ISD::VPERMIL2:
31017 case X86ISD::VPERMILPI:
31018 case X86ISD::VPERMILPV:
31019 case X86ISD::VPERM2X128:
31020 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
31021 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
31022 case ISD::MGATHER:
31023 case ISD::MSCATTER: return combineGatherScatter(N, DAG);
31024 case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
31025 case X86ISD::TESTM: return combineTestM(N, DAG);
31026 case X86ISD::PCMPEQ:
31027 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
31028 }
31029
31030 return SDValue();
31031 }
31032
31033 /// Return true if the target has native support for the specified value type
31034 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
31035 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
31036 /// some i16 instructions are slow.
31037 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
31038 if (!isTypeLegal(VT))
31039 return false;
31040 if (VT != MVT::i16)
31041 return true;
31042
31043 switch (Opc) {
31044 default:
31045 return true;
31046 case ISD::LOAD:
31047 case ISD::SIGN_EXTEND:
31048 case ISD::ZERO_EXTEND:
31049 case ISD::ANY_EXTEND:
31050 case ISD::SHL:
31051 case ISD::SRL:
31052 case ISD::SUB:
31053 case ISD::ADD:
31054 case ISD::MUL:
31055 case ISD::AND:
31056 case ISD::OR:
31057 case ISD::XOR:
31058 return false;
31059 }
31060 }
31061
31062 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
31063 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
31064 /// we don't adjust the stack we clobber the first frame index.
31065 /// See X86InstrInfo::copyPhysReg.
31066 bool X86TargetLowering::hasCopyImplyingStackAdjustment(
31067 MachineFunction *MF) const {
31068 const MachineRegisterInfo &MRI = MF->getRegInfo();
31069
31070 return any_of(MRI.reg_instructions(X86::EFLAGS),
31071 [](const MachineInstr &RI) { return RI.isCopy(); });
31072 }
31073
31074 /// This method queries the target whether it is beneficial for the DAG combiner to
31075 /// promote the specified node. If true, it should return the desired promotion
31076 /// type by reference.
31077 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
31078 EVT VT = Op.getValueType();
31079 if (VT != MVT::i16)
31080 return false;
31081
31082 bool Promote = false;
31083 bool Commute = false;
31084 switch (Op.getOpcode()) {
31085 default: break;
31086 case ISD::SIGN_EXTEND:
31087 case ISD::ZERO_EXTEND:
31088 case ISD::ANY_EXTEND:
31089 Promote = true;
31090 break;
31091 case ISD::SHL:
31092 case ISD::SRL: {
31093 SDValue N0 = Op.getOperand(0);
31094 // Look out for (store (shl (load), x)).
31095 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
31096 return false;
31097 Promote = true;
31098 break;
31099 }
31100 case ISD::ADD:
31101 case ISD::MUL:
31102 case ISD::AND:
31103 case ISD::OR:
31104 case ISD::XOR:
31105 Commute = true;
31106 // fallthrough
31107 case ISD::SUB: {
31108 SDValue N0 = Op.getOperand(0);
31109 SDValue N1 = Op.getOperand(1);
31110 if (!Commute && MayFoldLoad(N1))
31111 return false;
31112 // Avoid disabling potential load folding opportunities.
31113 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
31114 return false;
31115 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
31116 return false;
31117 Promote = true;
31118 }
31119 }
31120
31121 PVT = MVT::i32;
31122 return Promote;
31123 }
31124
31125 //===----------------------------------------------------------------------===//
31126 // X86 Inline Assembly Support
31127 //===----------------------------------------------------------------------===//
31128
31129 // Helper to match a string against a sequence of pieces separated by whitespace.
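// For example, matchAsm("bswap   $0", {"bswap", "$0"}) returns true.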
31130 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
31131 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
31132
31133 for (StringRef Piece : Pieces) {
31134 if (!S.startswith(Piece)) // Check if the piece matches.
31135 return false;
31136
31137 S = S.substr(Piece.size());
31138 StringRef::size_type Pos = S.find_first_not_of(" \t");
31139 if (Pos == 0) // We matched a prefix.
31140 return false;
31141
31142 S = S.substr(Pos);
31143 }
31144
31145 return S.empty();
31146 }
31147
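// Returns true when the asm's clobber list names the flag registers
// ("~{cc}", "~{flags}", "~{fpsr}", plus "~{dirflag}" when four entries are
// present); the rorw/rorl patterns below are only replaced in that case.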
31148 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
31149
31150 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
31151 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
31152 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
31153 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
31154
31155 if (AsmPieces.size() == 3)
31156 return true;
31157 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
31158 return true;
31159 }
31160 }
31161 return false;
31162 }
31163
31164 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
31165 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
31166
31167 const std::string &AsmStr = IA->getAsmString();
31168
31169 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
31170 if (!Ty || Ty->getBitWidth() % 16 != 0)
31171 return false;
31172
31173 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
31174 SmallVector<StringRef, 4> AsmPieces;
31175 SplitString(AsmStr, AsmPieces, ";\n");
31176
31177 switch (AsmPieces.size()) {
31178 default: return false;
31179 case 1:
31180 // FIXME: this should verify that we are targeting a 486 or better. If not,
31181 // we will turn this bswap into something that will be lowered to logical
31182 // ops instead of emitting the bswap asm. For now, we don't support 486 or
31183 // lower so don't worry about this.
31184 // bswap $0
31185 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
31186 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
31187 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
31188 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
31189 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
31190 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
31191 // No need to check constraints, nothing other than the equivalent of
31192 // "=r,0" would be valid here.
31193 return IntrinsicLowering::LowerToByteSwap(CI);
31194 }
31195
31196 // rorw $$8, ${0:w} --> llvm.bswap.i16
31197 if (CI->getType()->isIntegerTy(16) &&
31198 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
31199 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
31200 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
31201 AsmPieces.clear();
31202 StringRef ConstraintsStr = IA->getConstraintString();
31203 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
31204 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
31205 if (clobbersFlagRegisters(AsmPieces))
31206 return IntrinsicLowering::LowerToByteSwap(CI);
31207 }
31208 break;
31209 case 3:
31210 if (CI->getType()->isIntegerTy(32) &&
31211 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
31212 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
31213 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
31214 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
31215 AsmPieces.clear();
31216 StringRef ConstraintsStr = IA->getConstraintString();
31217 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
31218 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
31219 if (clobbersFlagRegisters(AsmPieces))
31220 return IntrinsicLowering::LowerToByteSwap(CI);
31221 }
31222
31223 if (CI->getType()->isIntegerTy(64)) {
31224 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
31225 if (Constraints.size() >= 2 &&
31226 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
31227 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
31228 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
31229 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
31230 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
31231 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
31232 return IntrinsicLowering::LowerToByteSwap(CI);
31233 }
31234 }
31235 break;
31236 }
31237 return false;
31238 }
31239
31240 /// Given a constraint letter, return the type of constraint for this target.
31241 X86TargetLowering::ConstraintType
31242 X86TargetLowering::getConstraintType(StringRef Constraint) const {
31243 if (Constraint.size() == 1) {
31244 switch (Constraint[0]) {
31245 case 'R':
31246 case 'q':
31247 case 'Q':
31248 case 'f':
31249 case 't':
31250 case 'u':
31251 case 'y':
31252 case 'x':
31253 case 'Y':
31254 case 'l':
31255 return C_RegisterClass;
31256 case 'a':
31257 case 'b':
31258 case 'c':
31259 case 'd':
31260 case 'S':
31261 case 'D':
31262 case 'A':
31263 return C_Register;
31264 case 'I':
31265 case 'J':
31266 case 'K':
31267 case 'L':
31268 case 'M':
31269 case 'N':
31270 case 'G':
31271 case 'C':
31272 case 'e':
31273 case 'Z':
31274 return C_Other;
31275 default:
31276 break;
31277 }
31278 }
31279 return TargetLowering::getConstraintType(Constraint);
31280 }
31281
31282 /// Examine constraint type and operand type and determine a weight value.
31283 /// This object must already have been set up with the operand type
31284 /// and the current alternative constraint selected.
31285 TargetLowering::ConstraintWeight
31286 X86TargetLowering::getSingleConstraintMatchWeight(
31287 AsmOperandInfo &info, const char *constraint) const {
31288 ConstraintWeight weight = CW_Invalid;
31289 Value *CallOperandVal = info.CallOperandVal;
31290 // If we don't have a value, we can't do a match,
31291 // but allow it at the lowest weight.
31292 if (!CallOperandVal)
31293 return CW_Default;
31294 Type *type = CallOperandVal->getType();
31295 // Look at the constraint type.
31296 switch (*constraint) {
31297 default:
31298 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
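    // fallthrough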
31299 case 'R':
31300 case 'q':
31301 case 'Q':
31302 case 'a':
31303 case 'b':
31304 case 'c':
31305 case 'd':
31306 case 'S':
31307 case 'D':
31308 case 'A':
31309 if (CallOperandVal->getType()->isIntegerTy())
31310 weight = CW_SpecificReg;
31311 break;
31312 case 'f':
31313 case 't':
31314 case 'u':
31315 if (type->isFloatingPointTy())
31316 weight = CW_SpecificReg;
31317 break;
31318 case 'y':
31319 if (type->isX86_MMXTy() && Subtarget.hasMMX())
31320 weight = CW_SpecificReg;
31321 break;
31322 case 'x':
31323 case 'Y':
31324 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
31325 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
31326 weight = CW_Register;
31327 break;
31328 case 'I':
31329 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
31330 if (C->getZExtValue() <= 31)
31331 weight = CW_Constant;
31332 }
31333 break;
31334 case 'J':
31335 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31336 if (C->getZExtValue() <= 63)
31337 weight = CW_Constant;
31338 }
31339 break;
31340 case 'K':
31341 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31342 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
31343 weight = CW_Constant;
31344 }
31345 break;
31346 case 'L':
31347 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31348 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
31349 weight = CW_Constant;
31350 }
31351 break;
31352 case 'M':
31353 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31354 if (C->getZExtValue() <= 3)
31355 weight = CW_Constant;
31356 }
31357 break;
31358 case 'N':
31359 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31360 if (C->getZExtValue() <= 0xff)
31361 weight = CW_Constant;
31362 }
31363 break;
31364 case 'G':
31365 case 'C':
31366 if (isa<ConstantFP>(CallOperandVal)) {
31367 weight = CW_Constant;
31368 }
31369 break;
31370 case 'e':
31371 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31372 if ((C->getSExtValue() >= -0x80000000LL) &&
31373 (C->getSExtValue() <= 0x7fffffffLL))
31374 weight = CW_Constant;
31375 }
31376 break;
31377 case 'Z':
31378 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31379 if (C->getZExtValue() <= 0xffffffff)
31380 weight = CW_Constant;
31381 }
31382 break;
31383 }
31384 return weight;
31385 }
31386
31387 /// Try to replace an X constraint, which matches anything, with another that
31388 /// has more specific requirements based on the type of the corresponding
31389 /// operand.
31390 const char *X86TargetLowering::
31391 LowerXConstraint(EVT ConstraintVT) const {
31392 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
31393 // 'f' like normal targets.
31394 if (ConstraintVT.isFloatingPoint()) {
31395 if (Subtarget.hasSSE2())
31396 return "Y";
31397 if (Subtarget.hasSSE1())
31398 return "x";
31399 }
31400
31401 return TargetLowering::LowerXConstraint(ConstraintVT);
31402 }
31403
31404 /// Lower the specified operand into the Ops vector.
31405 /// If it is invalid, don't add anything to Ops.
31406 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
31407 std::string &Constraint,
31408 std::vector<SDValue> &Ops,
31409 SelectionDAG &DAG) const {
31410 SDValue Result;
31411
31412 // Only support length 1 constraints for now.
31413 if (Constraint.length() > 1) return;
31414
31415 char ConstraintLetter = Constraint[0];
31416 switch (ConstraintLetter) {
31417 default: break;
31418 case 'I':
31419 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31420 if (C->getZExtValue() <= 31) {
31421 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31422 Op.getValueType());
31423 break;
31424 }
31425 }
31426 return;
31427 case 'J':
31428 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31429 if (C->getZExtValue() <= 63) {
31430 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31431 Op.getValueType());
31432 break;
31433 }
31434 }
31435 return;
31436 case 'K':
31437 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31438 if (isInt<8>(C->getSExtValue())) {
31439 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31440 Op.getValueType());
31441 break;
31442 }
31443 }
31444 return;
31445 case 'L':
31446 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31447 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
31448 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
31449 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
31450 Op.getValueType());
31451 break;
31452 }
31453 }
31454 return;
31455 case 'M':
31456 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31457 if (C->getZExtValue() <= 3) {
31458 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31459 Op.getValueType());
31460 break;
31461 }
31462 }
31463 return;
31464 case 'N':
31465 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31466 if (C->getZExtValue() <= 255) {
31467 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31468 Op.getValueType());
31469 break;
31470 }
31471 }
31472 return;
31473 case 'O':
31474 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31475 if (C->getZExtValue() <= 127) {
31476 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31477 Op.getValueType());
31478 break;
31479 }
31480 }
31481 return;
31482 case 'e': {
31483 // 32-bit signed value
31484 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31485 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
31486 C->getSExtValue())) {
31487 // Widen to 64 bits here to get it sign extended.
31488 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
31489 break;
31490 }
31491 // FIXME gcc accepts some relocatable values here too, but only in certain
31492 // memory models; it's complicated.
31493 }
31494 return;
31495 }
31496 case 'Z': {
31497 // 32-bit unsigned value
31498 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31499 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
31500 C->getZExtValue())) {
31501 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31502 Op.getValueType());
31503 break;
31504 }
31505 }
31506 // FIXME gcc accepts some relocatable values here too, but only in certain
31507 // memory models; it's complicated.
31508 return;
31509 }
31510 case 'i': {
31511 // Literal immediates are always ok.
31512 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
31513 // Widen to 64 bits here to get it sign extended.
31514 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
31515 break;
31516 }
31517
31518 // In any sort of PIC mode addresses need to be computed at runtime by
31519 // adding in a register or some sort of table lookup. These can't
31520 // be used as immediates.
31521 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
31522 return;
31523
31524 // If we are in non-pic codegen mode, we allow the address of a global (with
31525 // an optional displacement) to be used with 'i'.
31526 GlobalAddressSDNode *GA = nullptr;
31527 int64_t Offset = 0;
31528
31529 // Match either (GA), (GA+C), (GA+C1+C2), etc.
31530 while (1) {
31531 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
31532 Offset += GA->getOffset();
31533 break;
31534 } else if (Op.getOpcode() == ISD::ADD) {
31535 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
31536 Offset += C->getZExtValue();
31537 Op = Op.getOperand(0);
31538 continue;
31539 }
31540 } else if (Op.getOpcode() == ISD::SUB) {
31541 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
31542 Offset += -C->getZExtValue();
31543 Op = Op.getOperand(0);
31544 continue;
31545 }
31546 }
31547
31548 // Otherwise, this isn't something we can handle, reject it.
31549 return;
31550 }
31551
31552 const GlobalValue *GV = GA->getGlobal();
31553 // If we require an extra load to get this address, as in PIC mode, we
31554 // can't accept it.
31555 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
31556 return;
31557
31558 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
31559 GA->getValueType(0), Offset);
31560 break;
31561 }
31562 }
31563
31564 if (Result.getNode()) {
31565 Ops.push_back(Result);
31566 return;
31567 }
31568 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
31569 }
31570
31571 /// Check if \p RC is a general purpose register class.
31572 /// I.e., GR* or one of their variant.
31573 static bool isGRClass(const TargetRegisterClass &RC) {
31574 switch (RC.getID()) {
31575 case X86::GR8RegClassID:
31576 case X86::GR8_ABCD_LRegClassID:
31577 case X86::GR8_ABCD_HRegClassID:
31578 case X86::GR8_NOREXRegClassID:
31579 case X86::GR16RegClassID:
31580 case X86::GR16_ABCDRegClassID:
31581 case X86::GR16_NOREXRegClassID:
31582 case X86::GR32RegClassID:
31583 case X86::GR32_ABCDRegClassID:
31584 case X86::GR32_TCRegClassID:
31585 case X86::GR32_NOREXRegClassID:
31586 case X86::GR32_NOAXRegClassID:
31587 case X86::GR32_NOSPRegClassID:
31588 case X86::GR32_NOREX_NOSPRegClassID:
31589 case X86::GR32_ADRegClassID:
31590 case X86::GR64RegClassID:
31591 case X86::GR64_ABCDRegClassID:
31592 case X86::GR64_TCRegClassID:
31593 case X86::GR64_TCW64RegClassID:
31594 case X86::GR64_NOREXRegClassID:
31595 case X86::GR64_NOSPRegClassID:
31596 case X86::GR64_NOREX_NOSPRegClassID:
31597 case X86::LOW32_ADDR_ACCESSRegClassID:
31598 case X86::LOW32_ADDR_ACCESS_RBPRegClassID:
31599 return true;
31600 default:
31601 return false;
31602 }
31603 }
31604
31605 /// Check if \p RC is a vector register class.
31606 /// I.e., FR* / VR* or one of their variant.
31607 static bool isFRClass(const TargetRegisterClass &RC) {
31608 switch (RC.getID()) {
31609 case X86::FR32RegClassID:
31610 case X86::FR32XRegClassID:
31611 case X86::FR64RegClassID:
31612 case X86::FR64XRegClassID:
31613 case X86::FR128RegClassID:
31614 case X86::VR64RegClassID:
31615 case X86::VR128RegClassID:
31616 case X86::VR128LRegClassID:
31617 case X86::VR128HRegClassID:
31618 case X86::VR128XRegClassID:
31619 case X86::VR256RegClassID:
31620 case X86::VR256LRegClassID:
31621 case X86::VR256HRegClassID:
31622 case X86::VR256XRegClassID:
31623 case X86::VR512RegClassID:
31624 return true;
31625 default:
31626 return false;
31627 }
31628 }
31629
31630 std::pair<unsigned, const TargetRegisterClass *>
31631 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
31632 StringRef Constraint,
31633 MVT VT) const {
31634 // First, see if this is a constraint that directly corresponds to an LLVM
31635 // register class.
31636 if (Constraint.size() == 1) {
31637 // GCC Constraint Letters
31638 switch (Constraint[0]) {
31639 default: break;
31640 // TODO: Slight differences here in allocation order and leaving
31641 // RIP in the class. Do they matter any more here than they do
31642 // in the normal allocation?
31643 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
31644 if (Subtarget.is64Bit()) {
31645 if (VT == MVT::i32 || VT == MVT::f32)
31646 return std::make_pair(0U, &X86::GR32RegClass);
31647 if (VT == MVT::i16)
31648 return std::make_pair(0U, &X86::GR16RegClass);
31649 if (VT == MVT::i8 || VT == MVT::i1)
31650 return std::make_pair(0U, &X86::GR8RegClass);
31651 if (VT == MVT::i64 || VT == MVT::f64)
31652 return std::make_pair(0U, &X86::GR64RegClass);
31653 break;
31654 }
31655 // 32-bit fallthrough
31656 case 'Q': // Q_REGS
31657 if (VT == MVT::i32 || VT == MVT::f32)
31658 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
31659 if (VT == MVT::i16)
31660 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
31661 if (VT == MVT::i8 || VT == MVT::i1)
31662 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
31663 if (VT == MVT::i64)
31664 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
31665 break;
31666 case 'r': // GENERAL_REGS
31667 case 'l': // INDEX_REGS
31668 if (VT == MVT::i8 || VT == MVT::i1)
31669 return std::make_pair(0U, &X86::GR8RegClass);
31670 if (VT == MVT::i16)
31671 return std::make_pair(0U, &X86::GR16RegClass);
31672 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
31673 return std::make_pair(0U, &X86::GR32RegClass);
31674 return std::make_pair(0U, &X86::GR64RegClass);
31675 case 'R': // LEGACY_REGS
31676 if (VT == MVT::i8 || VT == MVT::i1)
31677 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
31678 if (VT == MVT::i16)
31679 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
31680 if (VT == MVT::i32 || !Subtarget.is64Bit())
31681 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
31682 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
31683 case 'f': // FP Stack registers.
31684 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
31685 // value to the correct fpstack register class.
31686 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
31687 return std::make_pair(0U, &X86::RFP32RegClass);
31688 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
31689 return std::make_pair(0U, &X86::RFP64RegClass);
31690 return std::make_pair(0U, &X86::RFP80RegClass);
31691 case 'y': // MMX_REGS if MMX allowed.
31692 if (!Subtarget.hasMMX()) break;
31693 return std::make_pair(0U, &X86::VR64RegClass);
31694 case 'Y': // SSE_REGS if SSE2 allowed
31695 if (!Subtarget.hasSSE2()) break;
31696 // FALL THROUGH.
31697 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
31698 if (!Subtarget.hasSSE1()) break;
31699
31700 switch (VT.SimpleTy) {
31701 default: break;
31702 // Scalar SSE types.
31703 case MVT::f32:
31704 case MVT::i32:
31705 return std::make_pair(0U, &X86::FR32RegClass);
31706 case MVT::f64:
31707 case MVT::i64:
31708 return std::make_pair(0U, &X86::FR64RegClass);
31709 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
31710 // Vector types.
31711 case MVT::v16i8:
31712 case MVT::v8i16:
31713 case MVT::v4i32:
31714 case MVT::v2i64:
31715 case MVT::v4f32:
31716 case MVT::v2f64:
31717 return std::make_pair(0U, &X86::VR128RegClass);
31718 // AVX types.
31719 case MVT::v32i8:
31720 case MVT::v16i16:
31721 case MVT::v8i32:
31722 case MVT::v4i64:
31723 case MVT::v8f32:
31724 case MVT::v4f64:
31725 return std::make_pair(0U, &X86::VR256RegClass);
31726 case MVT::v8f64:
31727 case MVT::v16f32:
31728 case MVT::v16i32:
31729 case MVT::v8i64:
31730 return std::make_pair(0U, &X86::VR512RegClass);
31731 }
31732 break;
31733 }
31734 }
31735
31736 // Use the default implementation in TargetLowering to convert the register
31737 // constraint into a member of a register class.
31738 std::pair<unsigned, const TargetRegisterClass*> Res;
31739 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
31740
31741 // Not found as a standard register?
31742 if (!Res.second) {
31743 // Map st(0) .. st(7) to the corresponding FP0 .. FP7 (RFP80) register.
31744 if (Constraint.size() == 7 && Constraint[0] == '{' &&
31745 tolower(Constraint[1]) == 's' &&
31746 tolower(Constraint[2]) == 't' &&
31747 Constraint[3] == '(' &&
31748 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
31749 Constraint[5] == ')' &&
31750 Constraint[6] == '}') {
31751
31752 Res.first = X86::FP0+Constraint[4]-'0';
31753 Res.second = &X86::RFP80RegClass;
31754 return Res;
31755 }
31756
31757 // GCC allows "st(0)" to be called just plain "st".
31758 if (StringRef("{st}").equals_lower(Constraint)) {
31759 Res.first = X86::FP0;
31760 Res.second = &X86::RFP80RegClass;
31761 return Res;
31762 }
31763
31764 // flags -> EFLAGS
31765 if (StringRef("{flags}").equals_lower(Constraint)) {
31766 Res.first = X86::EFLAGS;
31767 Res.second = &X86::CCRRegClass;
31768 return Res;
31769 }
31770
31771 // 'A' means EAX + EDX.
31772 if (Constraint == "A") {
31773 Res.first = X86::EAX;
31774 Res.second = &X86::GR32_ADRegClass;
31775 return Res;
31776 }
31777 return Res;
31778 }
31779
31780 // Otherwise, check to see if this is a register class of the wrong value
31781 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
31782 // turn into {ax},{dx}.
31783 // MVT::Other is used to specify clobber names.
31784 if (Res.second->hasType(VT) || VT == MVT::Other)
31785 return Res; // Correct type already, nothing to do.
31786
31787 // Get a matching integer of the correct size, e.g. "ax" with MVT::i32 should
31788 // return "eax". This should even work for things like getting 64-bit integer
31789 // registers when given an f64 type.
31790 const TargetRegisterClass *Class = Res.second;
31791 // The generic code will match the first register class that contains the
31792 // given register. Thus, based on the ordering of the tablegened file,
31793 // the "plain" GR classes might not come first.
31794 // Therefore, use a helper method.
31795 if (isGRClass(*Class)) {
31796 unsigned Size = VT.getSizeInBits();
31797 if (Size == 1) Size = 8;
31798 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
31799 if (DestReg > 0) {
31800 Res.first = DestReg;
31801 Res.second = Size == 8 ? &X86::GR8RegClass
31802 : Size == 16 ? &X86::GR16RegClass
31803 : Size == 32 ? &X86::GR32RegClass
31804 : &X86::GR64RegClass;
31805 assert(Res.second->contains(Res.first) && "Register in register class");
31806 } else {
31807 // No register found/type mismatch.
31808 Res.first = 0;
31809 Res.second = nullptr;
31810 }
31811 } else if (isFRClass(*Class)) {
31812 // Handle references to XMM physical registers that got mapped into the
31813 // wrong class. This can happen with constraints like {xmm0} where the
31814 // target independent register mapper will just pick the first match it can
31815 // find, ignoring the required type.
31816
31817 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
31818 if (VT == MVT::f32 || VT == MVT::i32)
31819 Res.second = &X86::FR32RegClass;
31820 else if (VT == MVT::f64 || VT == MVT::i64)
31821 Res.second = &X86::FR64RegClass;
31822 else if (X86::VR128RegClass.hasType(VT))
31823 Res.second = &X86::VR128RegClass;
31824 else if (X86::VR256RegClass.hasType(VT))
31825 Res.second = &X86::VR256RegClass;
31826 else if (X86::VR512RegClass.hasType(VT))
31827 Res.second = &X86::VR512RegClass;
31828 else {
31829 // Type mismatch and not a clobber: return an error.
31830 Res.first = 0;
31831 Res.second = nullptr;
31832 }
31833 }
31834
31835 return Res;
31836 }
31837
31838 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
31839 const AddrMode &AM, Type *Ty,
31840 unsigned AS) const {
31841 // Scaling factors are not free at all.
31842 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
31843 // will take 2 allocations in the out of order engine instead of 1
31844 // for plain addressing mode, i.e. inst (reg1).
31845 // E.g.,
31846 // vaddps (%rsi,%rdx), %ymm0, %ymm1
31847 // Requires two allocations (one for the load, one for the computation)
31848 // whereas:
31849 // vaddps (%rsi), %ymm0, %ymm1
31850 // Requires just 1 allocation, i.e., freeing allocations for other operations
31851 // and having less micro operations to execute.
31852 //
31853 // For some X86 architectures, this is even worse because for instance for
31854 // stores, the complex addressing mode forces the instruction to use the
31855 // "load" ports instead of the dedicated "store" port.
31856 // E.g., on Haswell:
31857 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
31858 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
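  // Per the TargetLowering contract, a negative return value means the
  // addressing mode is not legal at all; non-negative values give the extra
  // cost of using it.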
31859 if (isLegalAddressingMode(DL, AM, Ty, AS))
31860 // Scale represents reg2 * scale, thus account for 1
31861 // as soon as we use a second register.
31862 return AM.Scale != 0;
31863 return -1;
31864 }
31865
31866 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
31867 // Integer division on x86 is expensive. However, when aggressively optimizing
31868 // for code size, we prefer to use a div instruction, as it is usually smaller
31869 // than the alternative sequence.
31870 // The exception to this is vector division. Since x86 doesn't have vector
31871 // integer division, leaving the division as-is is a loss even in terms of
31872 // size, because it will have to be scalarized, while the alternative code
31873 // sequence can be performed in vector form.
31874 bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
31875 Attribute::MinSize);
31876 return OptSize && !VT.isVector();
31877 }
31878
31879 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
31880 if (!Subtarget.is64Bit())
31881 return;
31882
31883 // Update IsSplitCSR in X86MachineFunctionInfo.
31884 X86MachineFunctionInfo *AFI =
31885 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
31886 AFI->setIsSplitCSR(true);
31887 }
31888
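// With split CSRs (used for CXX_FAST_TLS calling conventions), copy each
// callee-saved register into a fresh virtual register at the entry block and
// copy it back right before the terminator of every exit block, so the
// registers are preserved via copies rather than the usual prologue/epilogue
// spills.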
31889 void X86TargetLowering::insertCopiesSplitCSR(
31890 MachineBasicBlock *Entry,
31891 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
31892 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
31893 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
31894 if (!IStart)
31895 return;
31896
31897 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31898 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
31899 MachineBasicBlock::iterator MBBI = Entry->begin();
31900 for (const MCPhysReg *I = IStart; *I; ++I) {
31901 const TargetRegisterClass *RC = nullptr;
31902 if (X86::GR64RegClass.contains(*I))
31903 RC = &X86::GR64RegClass;
31904 else
31905 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
31906
31907 unsigned NewVR = MRI->createVirtualRegister(RC);
31908 // Create copy from CSR to a virtual register.
31909 // FIXME: this currently does not emit CFI pseudo-instructions, it works
31910 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
31911 // nounwind. If we want to generalize this later, we may need to emit
31912 // CFI pseudo-instructions.
31913 assert(Entry->getParent()->getFunction()->hasFnAttribute(
31914 Attribute::NoUnwind) &&
31915 "Function should be nounwind in insertCopiesSplitCSR!");
31916 Entry->addLiveIn(*I);
31917 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
31918 .addReg(*I);
31919
31920 // Insert the copy-back instructions right before the terminator.
31921 for (auto *Exit : Exits)
31922 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
31923 TII->get(TargetOpcode::COPY), *I)
31924 .addReg(NewVR);
31925 }
31926 }
31927