1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===//
2 //
3 // The Subzero Code Generator
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Implements the TargetLoweringX8632 class, which consists almost
12 /// entirely of the lowering sequence for each high-level instruction.
13 ///
14 //===----------------------------------------------------------------------===//
15
16 #include "IceTargetLoweringX8632.h"
17
18 #include "IceCfg.h"
19 #include "IceCfgNode.h"
20 #include "IceClFlags.h"
21 #include "IceDefs.h"
22 #include "IceELFObjectWriter.h"
23 #include "IceGlobalInits.h"
24 #include "IceInstVarIter.h"
25 #include "IceInstX8632.h"
26 #include "IceLiveness.h"
27 #include "IceOperand.h"
28 #include "IcePhiLoweringImpl.h"
29 #include "IceTargetLoweringX8632.def"
30 #include "IceUtils.h"
31 #include "IceVariableSplitting.h"
32
33 #include "llvm/Support/MathExtras.h"
34
35 #include <stack>
36
37 #if defined(_WIN32)
38 extern "C" void _chkstk();
39 #endif
40
41 namespace X8632 {
42
std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) {
  return ::Ice::X8632::TargetX8632::create(Func);
}

std::unique_ptr<::Ice::TargetDataLowering>
createTargetDataLowering(::Ice::GlobalContext *Ctx) {
  return ::Ice::X8632::TargetDataX8632::create(Ctx);
}

std::unique_ptr<::Ice::TargetHeaderLowering>
createTargetHeaderLowering(::Ice::GlobalContext *Ctx) {
  return ::Ice::X8632::TargetHeaderX86::create(Ctx);
}

void staticInit(::Ice::GlobalContext *Ctx) {
  ::Ice::X8632::TargetX8632::staticInit(Ctx);
}

bool shouldBePooled(const class ::Ice::Constant *C) {
  return ::Ice::X8632::TargetX8632::shouldBePooled(C);
}

::Ice::Type getPointerType() { return ::Ice::Type::IceType_i32; }
66
67 } // end of namespace X8632
68
69 namespace Ice {
70 namespace X8632 {
71
72 /// The number of bits in a byte
73 static constexpr uint32_t X86_CHAR_BIT = 8;
74 /// Size of the return address on the stack
75 static constexpr uint32_t X86_RET_IP_SIZE_BYTES = 4;
76
77 /// \name Limits for unrolling memory intrinsics.
78 /// @{
79 static constexpr uint32_t MEMCPY_UNROLL_LIMIT = 8;
80 static constexpr uint32_t MEMMOVE_UNROLL_LIMIT = 8;
81 static constexpr uint32_t MEMSET_UNROLL_LIMIT = 8;
82 /// @}
83
BoolFoldingEntry::BoolFoldingEntry(Inst *I)
    : Instr(I), IsComplex(BoolFolding::hasComplexLowering(I)) {}

BoolFolding::BoolFoldingProducerKind
BoolFolding::getProducerKind(const Inst *Instr) {
89 if (llvm::isa<InstIcmp>(Instr)) {
90 if (Instr->getSrc(0)->getType() != IceType_i64)
91 return PK_Icmp32;
92 return PK_Icmp64;
93 }
94 if (llvm::isa<InstFcmp>(Instr))
95 return PK_Fcmp;
96 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
97 if (Arith->getSrc(0)->getType() != IceType_i64) {
98 switch (Arith->getOp()) {
99 default:
100 return PK_None;
101 case InstArithmetic::And:
102 case InstArithmetic::Or:
103 return PK_Arith;
104 }
105 }
106 }
107 return PK_None; // TODO(stichnot): remove this
108
109 if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
110 switch (Cast->getCastKind()) {
111 default:
112 return PK_None;
113 case InstCast::Trunc:
114 return PK_Trunc;
115 }
116 }
117 return PK_None;
118 }
119
BoolFolding::BoolFoldingConsumerKind
BoolFolding::getConsumerKind(const Inst *Instr) {
122 if (llvm::isa<InstBr>(Instr))
123 return CK_Br;
124 if (llvm::isa<InstSelect>(Instr))
125 return CK_Select;
126 return CK_None; // TODO(stichnot): remove this
127
128 if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
129 switch (Cast->getCastKind()) {
130 default:
131 return CK_None;
132 case InstCast::Sext:
133 return CK_Sext;
134 case InstCast::Zext:
135 return CK_Zext;
136 }
137 }
138 return CK_None;
139 }
140
/// Returns true if the producing instruction has a "complex" lowering sequence.
/// This generally means that its lowering sequence requires more than one
/// conditional branch, namely 64-bit integer compares and some floating-point
/// compares. When this is true and there is more than one consumer, we prefer
/// to disable the folding optimization, since duplicating the producer at each
/// consumer would increase the number of branches.
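///
/// For instance (as a rough illustration), a 64-bit compare that feeds a
/// branch is lowered with a pair of 32-bit compares and conditional branches,
/// so folding it into several consumers would replicate that whole sequence.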

bool BoolFolding::hasComplexLowering(const Inst *Instr) {
148 switch (getProducerKind(Instr)) {
149 default:
150 return false;
151 case PK_Icmp64:
152 return true;
153 case PK_Fcmp:
154 return TargetX8632::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()]
155 .C2 != CondX86::Br_None;
156 }
157 }
158
bool BoolFolding::isValidFolding(
    BoolFolding::BoolFoldingProducerKind ProducerKind,
    BoolFolding::BoolFoldingConsumerKind ConsumerKind) {
162 switch (ProducerKind) {
163 default:
164 return false;
165 case PK_Icmp32:
166 case PK_Icmp64:
167 case PK_Fcmp:
168 return (ConsumerKind == CK_Br) || (ConsumerKind == CK_Select);
169 case PK_Arith:
170 return ConsumerKind == CK_Br;
171 }
172 }
173
void BoolFolding::init(CfgNode *Node) {
175 Producers.clear();
176 for (Inst &Instr : Node->getInsts()) {
177 if (Instr.isDeleted())
178 continue;
179 invalidateProducersOnStore(&Instr);
180 // Check whether Instr is a valid producer.
181 Variable *Var = Instr.getDest();
182 if (Var) { // only consider instructions with an actual dest var
183 if (isBooleanType(Var->getType())) { // only bool-type dest vars
184 if (getProducerKind(&Instr) != PK_None) { // white-listed instructions
185 Producers[Var->getIndex()] = BoolFoldingEntry(&Instr);
186 }
187 }
188 }
189 // Check each src variable against the map.
190 FOREACH_VAR_IN_INST(Var, Instr) {
191 SizeT VarNum = Var->getIndex();
192 if (!containsValid(VarNum))
193 continue;
194 // All valid consumers use Var as the first source operand
195 if (IndexOfVarOperandInInst(Var) != 0) {
196 setInvalid(VarNum);
197 continue;
198 }
199 // Consumer instructions must be white-listed
200 BoolFolding::BoolFoldingConsumerKind ConsumerKind =
201 getConsumerKind(&Instr);
202 if (ConsumerKind == CK_None) {
203 setInvalid(VarNum);
204 continue;
205 }
206 BoolFolding::BoolFoldingProducerKind ProducerKind =
207 getProducerKind(Producers[VarNum].Instr);
208 if (!isValidFolding(ProducerKind, ConsumerKind)) {
209 setInvalid(VarNum);
210 continue;
211 }
212 // Avoid creating multiple copies of complex producer instructions.
213 if (Producers[VarNum].IsComplex && Producers[VarNum].NumUses > 0) {
214 setInvalid(VarNum);
215 continue;
216 }
217 ++Producers[VarNum].NumUses;
218 if (Instr.isLastUse(Var)) {
219 Producers[VarNum].IsLiveOut = false;
220 }
221 }
222 }
223 for (auto &I : Producers) {
224 // Ignore entries previously marked invalid.
225 if (I.second.Instr == nullptr)
226 continue;
227 // Disable the producer if its dest may be live beyond this block.
228 if (I.second.IsLiveOut) {
229 setInvalid(I.first);
230 continue;
231 }
232 // Mark as "dead" rather than outright deleting. This is so that other
233 // peephole style optimizations during or before lowering have access to
234 // this instruction in undeleted form. See for example
235 // tryOptimizedCmpxchgCmpBr().
236 I.second.Instr->setDead();
237 }
238 }
239
const Inst *BoolFolding::getProducerFor(const Operand *Opnd) const {
241 auto *Var = llvm::dyn_cast<const Variable>(Opnd);
242 if (Var == nullptr)
243 return nullptr;
244 SizeT VarNum = Var->getIndex();
245 auto Element = Producers.find(VarNum);
246 if (Element == Producers.end())
247 return nullptr;
248 return Element->second.Instr;
249 }
250
void BoolFolding::dump(const Cfg *Func) const {
252 if (!BuildDefs::dump() || !Func->isVerbose(IceV_Folding))
253 return;
254 OstreamLocker L(Func->getContext());
255 Ostream &Str = Func->getContext()->getStrDump();
256 for (auto &I : Producers) {
257 if (I.second.Instr == nullptr)
258 continue;
259 Str << "Found foldable producer:\n ";
260 I.second.Instr->dump(Func);
261 Str << "\n";
262 }
263 }
264
/// If the given instruction has potential memory side effects (e.g. store, rmw,
/// or a call instruction with potential memory side effects), then we must not
/// allow a pre-store Producer instruction with memory operands to be folded
/// into a post-store Consumer instruction. If this is detected, the Producer
/// is invalidated.
///
/// We use the Producer's IsLiveOut field to determine whether any potential
/// Consumers come after this store instruction. The IsLiveOut field is
/// initialized to true, and BoolFolding::init() sets IsLiveOut to false when it
/// sees the variable's definitive last use (indicating the variable is not in
/// the node's live-out set). Thus if we see here that IsLiveOut is false, we
/// know that there can be no consumers after the store, and therefore we know
/// the folding is safe despite the store instruction.
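///
/// Illustrative (pseudo-IR) example:
///   t = icmp eq i32 [addr], 0   ; producer with a memory operand
///   store i32 1, [addr]         ; memory write
///   br t, ...                   ; consumer after the store
/// Folding the compare down into the branch would re-read [addr] after the
/// store, so the producer must be invalidated here.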

void BoolFolding::invalidateProducersOnStore(const Inst *Instr) {
280 if (!Instr->isMemoryWrite())
281 return;
282 for (auto &ProducerPair : Producers) {
283 if (!ProducerPair.second.IsLiveOut)
284 continue;
285 Inst *PInst = ProducerPair.second.Instr;
286 if (PInst == nullptr)
287 continue;
288 bool HasMemOperand = false;
289 const SizeT SrcSize = PInst->getSrcSize();
290 for (SizeT I = 0; I < SrcSize; ++I) {
291 if (llvm::isa<X86OperandMem>(PInst->getSrc(I))) {
292 HasMemOperand = true;
293 break;
294 }
295 }
296 if (!HasMemOperand)
297 continue;
298 setInvalid(ProducerPair.first);
299 }
300 }
301
void TargetX8632::initNodeForLowering(CfgNode *Node) {
303 FoldingInfo.init(Node);
304 FoldingInfo.dump(Func);
305 }
306
TargetX8632::TargetX8632(Cfg *Func) : TargetX86(Func) {}
308
void TargetX8632::staticInit(GlobalContext *Ctx) {
310 RegNumT::setLimit(RegX8632::Reg_NUM);
311 RegX8632::initRegisterSet(getFlags(), &TypeToRegisterSet, &RegisterAliases);
312 for (size_t i = 0; i < TypeToRegisterSet.size(); ++i)
313 TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
314 filterTypeToRegisterSet(Ctx, RegX8632::Reg_NUM, TypeToRegisterSet.data(),
315 TypeToRegisterSet.size(), RegX8632::getRegName,
316 getRegClassName);
317 }
318
bool TargetX8632::shouldBePooled(const Constant *C) {
320 if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(C)) {
321 return !Utils::isPositiveZero(ConstFloat->getValue());
322 }
323 if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(C)) {
324 return !Utils::isPositiveZero(ConstDouble->getValue());
325 }
326 return false;
327 }
328
Type TargetX8632::getPointerType() { return IceType_i32; }
330
void TargetX8632::translateO2() {
332 TimerMarker T(TimerStack::TT_O2, Func);
333
334 genTargetHelperCalls();
335 Func->dump("After target helper call insertion");
336
337 // Merge Alloca instructions, and lay out the stack.
338 static constexpr bool SortAndCombineAllocas = true;
339 Func->processAllocas(SortAndCombineAllocas);
340 Func->dump("After Alloca processing");
341
342 // Run this early so it can be used to focus optimizations on potentially hot
343 // code.
  // TODO(stichnot,ascull): currently this is only used for regalloc, not for
  // expensive high-level optimizations, which could also be focused on
  // potentially hot code.
347 Func->generateLoopInfo();
348 Func->dump("After loop analysis");
349 if (getFlags().getLoopInvariantCodeMotion()) {
350 Func->loopInvariantCodeMotion();
351 Func->dump("After LICM");
352 }
353
354 if (getFlags().getLocalCSE() != Ice::LCSE_Disabled) {
355 Func->localCSE(getFlags().getLocalCSE() == Ice::LCSE_EnabledSSA);
356 Func->dump("After Local CSE");
357 Func->floatConstantCSE();
358 }
359 if (getFlags().getEnableShortCircuit()) {
360 Func->shortCircuitJumps();
361 Func->dump("After Short Circuiting");
362 }
363
364 if (!getFlags().getEnablePhiEdgeSplit()) {
365 // Lower Phi instructions.
366 Func->placePhiLoads();
367 if (Func->hasError())
368 return;
369 Func->placePhiStores();
370 if (Func->hasError())
371 return;
372 Func->deletePhis();
373 if (Func->hasError())
374 return;
375 Func->dump("After Phi lowering");
376 }
377
378 // Address mode optimization.
379 Func->getVMetadata()->init(VMK_SingleDefs);
380 Func->doAddressOpt();
381 Func->materializeVectorShuffles();
382
383 // Find read-modify-write opportunities. Do this after address mode
384 // optimization so that doAddressOpt() doesn't need to be applied to RMW
385 // instructions as well.
386 findRMW();
387 Func->dump("After RMW transform");
388
389 // Argument lowering
390 Func->doArgLowering();
391
392 // Target lowering. This requires liveness analysis for some parts of the
393 // lowering decisions, such as compare/branch fusing. If non-lightweight
  // liveness analysis is used, the instructions need to be renumbered first.
395 // TODO: This renumbering should only be necessary if we're actually
396 // calculating live intervals, which we only do for register allocation.
397 Func->renumberInstructions();
398 if (Func->hasError())
399 return;
400
401 // TODO: It should be sufficient to use the fastest liveness calculation,
402 // i.e. livenessLightweight(). However, for some reason that slows down the
403 // rest of the translation. Investigate.
404 Func->liveness(Liveness_Basic);
405 if (Func->hasError())
406 return;
407 Func->dump("After x86 address mode opt");
408
409 doLoadOpt();
410
411 Func->genCode();
412 if (Func->hasError())
413 return;
414 Func->dump("After x86 codegen");
415 splitBlockLocalVariables(Func);
416
417 // Register allocation. This requires instruction renumbering and full
418 // liveness analysis. Loops must be identified before liveness so variable
419 // use weights are correct.
420 Func->renumberInstructions();
421 if (Func->hasError())
422 return;
423 Func->liveness(Liveness_Intervals);
424 if (Func->hasError())
425 return;
426 // The post-codegen dump is done here, after liveness analysis and associated
427 // cleanup, to make the dump cleaner and more useful.
428 Func->dump("After initial x86 codegen");
429 // Validate the live range computations. The expensive validation call is
430 // deliberately only made when assertions are enabled.
431 assert(Func->validateLiveness());
432 Func->getVMetadata()->init(VMK_All);
433 regAlloc(RAK_Global);
434 if (Func->hasError())
435 return;
436 Func->dump("After linear scan regalloc");
437
438 if (getFlags().getEnablePhiEdgeSplit()) {
439 Func->advancedPhiLowering();
440 Func->dump("After advanced Phi lowering");
441 }
442
443 // Stack frame mapping.
444 Func->genFrame();
445 if (Func->hasError())
446 return;
447 Func->dump("After stack frame mapping");
448
449 Func->contractEmptyNodes();
450 Func->reorderNodes();
451
452 // Branch optimization. This needs to be done just before code emission. In
453 // particular, no transformations that insert or reorder CfgNodes should be
454 // done after branch optimization. We go ahead and do it before nop insertion
455 // to reduce the amount of work needed for searching for opportunities.
456 Func->doBranchOpt();
457 Func->dump("After branch optimization");
458 }
459
void TargetX8632::translateOm1() {
461 TimerMarker T(TimerStack::TT_Om1, Func);
462
463 genTargetHelperCalls();
464
465 // Do not merge Alloca instructions, and lay out the stack.
466 // static constexpr bool SortAndCombineAllocas = false;
467 static constexpr bool SortAndCombineAllocas =
468 true; // TODO(b/171222930): Fix Win32 bug when this is false
469 Func->processAllocas(SortAndCombineAllocas);
470 Func->dump("After Alloca processing");
471
472 Func->placePhiLoads();
473 if (Func->hasError())
474 return;
475 Func->placePhiStores();
476 if (Func->hasError())
477 return;
478 Func->deletePhis();
479 if (Func->hasError())
480 return;
481 Func->dump("After Phi lowering");
482
483 Func->doArgLowering();
484 Func->genCode();
485 if (Func->hasError())
486 return;
487 Func->dump("After initial x86 codegen");
488
489 regAlloc(RAK_InfOnly);
490 if (Func->hasError())
491 return;
492 Func->dump("After regalloc of infinite-weight variables");
493
494 Func->genFrame();
495 if (Func->hasError())
496 return;
497 Func->dump("After stack frame mapping");
498 }
499
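// Returns true if the arithmetic op can be lowered as a read-modify-write
// against its memory destination: scalar integer add, sub, and, or, and xor
// qualify; vector ops, shifts, and the mul/div/rem family do not (see the
// switch below for the authoritative list).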
inline bool canRMW(const InstArithmetic *Arith) {
501 Type Ty = Arith->getDest()->getType();
502 // X86 vector instructions write to a register and have no RMW option.
503 if (isVectorType(Ty))
504 return false;
505 bool isI64 = Ty == IceType_i64;
506
507 switch (Arith->getOp()) {
508 // Not handled for lack of simple lowering:
509 // shift on i64
510 // mul, udiv, urem, sdiv, srem, frem
511 // Not handled for lack of RMW instructions:
512 // fadd, fsub, fmul, fdiv (also vector types)
513 default:
514 return false;
515 case InstArithmetic::Add:
516 case InstArithmetic::Sub:
517 case InstArithmetic::And:
518 case InstArithmetic::Or:
519 case InstArithmetic::Xor:
520 return true;
521 case InstArithmetic::Shl:
522 case InstArithmetic::Lshr:
523 case InstArithmetic::Ashr:
524 return false; // TODO(stichnot): implement
525 return !isI64;
526 }
527 }
528
bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
530 if (A == B)
531 return true;
532 if (auto *MemA = llvm::dyn_cast<X86OperandMem>(A)) {
533 if (auto *MemB = llvm::dyn_cast<X86OperandMem>(B)) {
534 return MemA->getBase() == MemB->getBase() &&
535 MemA->getOffset() == MemB->getOffset() &&
536 MemA->getIndex() == MemB->getIndex() &&
537 MemA->getShift() == MemB->getShift() &&
538 MemA->getSegmentRegister() == MemB->getSegmentRegister();
539 }
540 }
541 return false;
542 }
543
void TargetX8632::findRMW() {
545 TimerMarker _(TimerStack::TT_findRMW, Func);
546 Func->dump("Before RMW");
547 if (Func->isVerbose(IceV_RMW))
548 Func->getContext()->lockStr();
549 for (CfgNode *Node : Func->getNodes()) {
550 // Walk through the instructions, considering each sequence of 3
551 // instructions, and look for the particular RMW pattern. Note that this
552 // search can be "broken" (false negatives) if there are intervening
553 // deleted instructions, or intervening instructions that could be safely
554 // moved out of the way to reveal an RMW pattern.
555 auto E = Node->getInsts().end();
556 auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
557 for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
558 // Make I3 skip over deleted instructions.
559 while (I3 != E && I3->isDeleted())
560 ++I3;
561 if (I1 == E || I2 == E || I3 == E)
562 continue;
563 assert(!I1->isDeleted());
564 assert(!I2->isDeleted());
565 assert(!I3->isDeleted());
566 auto *Load = llvm::dyn_cast<InstLoad>(I1);
567 auto *Arith = llvm::dyn_cast<InstArithmetic>(I2);
568 auto *Store = llvm::dyn_cast<InstStore>(I3);
569 if (!Load || !Arith || !Store)
570 continue;
571 // Look for:
572 // a = Load addr
573 // b = <op> a, other
574 // Store b, addr
575 // Change to:
576 // a = Load addr
577 // b = <op> a, other
578 // x = FakeDef
579 // RMW <op>, addr, other, x
580 // b = Store b, addr, x
581 // Note that inferTwoAddress() makes sure setDestRedefined() gets called
582 // on the updated Store instruction, to avoid liveness problems later.
583 //
584 // With this transformation, the Store instruction acquires a Dest
585 // variable and is now subject to dead code elimination if there are no
586 // more uses of "b". Variable "x" is a beacon for determining whether the
587 // Store instruction gets dead-code eliminated. If the Store instruction
588 // is eliminated, then it must be the case that the RMW instruction ends
589 // x's live range, and therefore the RMW instruction will be retained and
590 // later lowered. On the other hand, if the RMW instruction does not end
591 // x's live range, then the Store instruction must still be present, and
592 // therefore the RMW instruction is ignored during lowering because it is
593 // redundant with the Store instruction.
594 //
595 // Note that if "a" has further uses, the RMW transformation may still
596 // trigger, resulting in two loads and one store, which is worse than the
597 // original one load and one store. However, this is probably rare, and
598 // caching probably keeps it just as fast.
599 if (!isSameMemAddressOperand(Load->getLoadAddress(),
600 Store->getStoreAddress()))
601 continue;
602 Operand *ArithSrcFromLoad = Arith->getSrc(0);
603 Operand *ArithSrcOther = Arith->getSrc(1);
604 if (ArithSrcFromLoad != Load->getDest()) {
605 if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
606 continue;
607 std::swap(ArithSrcFromLoad, ArithSrcOther);
608 }
609 if (Arith->getDest() != Store->getData())
610 continue;
611 if (!canRMW(Arith))
612 continue;
613 if (Func->isVerbose(IceV_RMW)) {
614 Ostream &Str = Func->getContext()->getStrDump();
615 Str << "Found RMW in " << Func->getFunctionName() << ":\n ";
616 Load->dump(Func);
617 Str << "\n ";
618 Arith->dump(Func);
619 Str << "\n ";
620 Store->dump(Func);
621 Str << "\n";
622 }
623 Variable *Beacon = Func->makeVariable(IceType_i32);
624 Beacon->setMustNotHaveReg();
625 Store->setRmwBeacon(Beacon);
626 auto *BeaconDef = InstFakeDef::create(Func, Beacon);
627 Node->getInsts().insert(I3, BeaconDef);
628 auto *RMW =
629 InstX86FakeRMW::create(Func, ArithSrcOther, Store->getStoreAddress(),
630 Beacon, Arith->getOp());
631 Node->getInsts().insert(I3, RMW);
632 }
633 }
634 if (Func->isVerbose(IceV_RMW))
635 Func->getContext()->unlockStr();
636 }
637
/// Value is in bytes. Return Value adjusted to the next highest multiple of
/// the stack alignment.
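/// For example, assuming a 16-byte stack alignment, a Value of 20 would be
/// rounded up to 32.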
uint32_t TargetX8632::applyStackAlignment(uint32_t Value) {
641 return Utils::applyAlignment(Value, X86_STACK_ALIGNMENT_BYTES);
642 }
643
644 // Converts a ConstantInteger32 operand into its constant value, or
645 // MemoryOrderInvalid if the operand is not a ConstantInteger32.
inline uint64_t getConstantMemoryOrder(Operand *Opnd) {
647 if (auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
648 return Integer->getValue();
649 return Intrinsics::MemoryOrderInvalid;
650 }
651
/// Determines whether the dest of a Load instruction can be folded into one of
/// the src operands of a 2-operand instruction. This is true as long as the
/// load dest matches exactly one of the binary instruction's src operands.
/// Replaces Src0 or Src1 with LoadSrc if the answer is true.
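///
/// For example (illustrative), given "a = load addr" followed by
/// "c = add a, b", Src0 (== a) is replaced with the memory operand formed
/// from addr, so the pair can later be lowered as "c = add [addr], b".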
inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
                                      Operand *&Src0, Operand *&Src1) {
658 if (Src0 == LoadDest && Src1 != LoadDest) {
659 Src0 = LoadSrc;
660 return true;
661 }
662 if (Src0 != LoadDest && Src1 == LoadDest) {
663 Src1 = LoadSrc;
664 return true;
665 }
666 return false;
667 }
668
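/// Fold a load (or a qualifying AtomicLoad intrinsic) into the instruction
/// that immediately follows it -- an arithmetic, compare, select, or cast --
/// provided that instruction ends the live range of the load's destination.
/// The pair is then replaced by a single instruction that uses the memory
/// operand directly.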
void TargetX8632::doLoadOpt() {
670 TimerMarker _(TimerStack::TT_loadOpt, Func);
671 for (CfgNode *Node : Func->getNodes()) {
672 Context.init(Node);
673 while (!Context.atEnd()) {
674 Variable *LoadDest = nullptr;
675 Operand *LoadSrc = nullptr;
676 Inst *CurInst = iteratorToInst(Context.getCur());
677 Inst *Next = Context.getNextInst();
678 // Determine whether the current instruction is a Load instruction or
679 // equivalent.
680 if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
681 // An InstLoad qualifies unless it uses a 64-bit absolute address,
682 // which requires legalization to insert a copy to register.
683 // TODO(b/148272103): Fold these after legalization.
684 LoadDest = Load->getDest();
685 constexpr bool DoLegalize = false;
686 LoadSrc = formMemoryOperand(Load->getLoadAddress(), LoadDest->getType(),
687 DoLegalize);
688 } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsic>(CurInst)) {
689 // An AtomicLoad intrinsic qualifies as long as it has a valid memory
690 // ordering, and can be implemented in a single instruction (i.e., not
691 // i64 on x86-32).
692 Intrinsics::IntrinsicID ID = Intrin->getIntrinsicID();
693 if (ID == Intrinsics::AtomicLoad &&
694 (Intrin->getDest()->getType() != IceType_i64) &&
695 Intrinsics::isMemoryOrderValid(
696 ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
697 LoadDest = Intrin->getDest();
698 constexpr bool DoLegalize = false;
699 LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
700 DoLegalize);
701 }
702 }
703 // A Load instruction can be folded into the following instruction only
704 // if the following instruction ends the Load's Dest variable's live
705 // range.
706 if (LoadDest && Next && Next->isLastUse(LoadDest)) {
707 assert(LoadSrc);
708 Inst *NewInst = nullptr;
709 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
710 Operand *Src0 = Arith->getSrc(0);
711 Operand *Src1 = Arith->getSrc(1);
712 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
713 NewInst = InstArithmetic::create(Func, Arith->getOp(),
714 Arith->getDest(), Src0, Src1);
715 }
716 } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
717 Operand *Src0 = Icmp->getSrc(0);
718 Operand *Src1 = Icmp->getSrc(1);
719 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
720 NewInst = InstIcmp::create(Func, Icmp->getCondition(),
721 Icmp->getDest(), Src0, Src1);
722 }
723 } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
724 Operand *Src0 = Fcmp->getSrc(0);
725 Operand *Src1 = Fcmp->getSrc(1);
726 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
727 NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
728 Fcmp->getDest(), Src0, Src1);
729 }
730 } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
731 Operand *Src0 = Select->getTrueOperand();
732 Operand *Src1 = Select->getFalseOperand();
733 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
734 NewInst = InstSelect::create(Func, Select->getDest(),
735 Select->getCondition(), Src0, Src1);
736 }
737 } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
738 // The load dest can always be folded into a Cast instruction.
739 auto *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
740 if (Src0 == LoadDest) {
741 NewInst = InstCast::create(Func, Cast->getCastKind(),
742 Cast->getDest(), LoadSrc);
743 }
744 }
745 if (NewInst) {
746 CurInst->setDeleted();
747 Next->setDeleted();
748 Context.insert(NewInst);
749 // Update NewInst->LiveRangesEnded so that target lowering may
750 // benefit. Also update NewInst->HasSideEffects.
751 NewInst->spliceLivenessInfo(Next, CurInst);
752 }
753 }
754 Context.advanceCur();
755 Context.advanceNext();
756 }
757 }
758 Func->dump("After load optimization");
759 }
760
bool TargetX8632::doBranchOpt(Inst *I, const CfgNode *NextNode) {
762 if (auto *Br = llvm::dyn_cast<InstX86Br>(I)) {
763 return Br->optimizeBranch(NextNode);
764 }
765 return false;
766 }
767
Variable *TargetX8632::getPhysicalRegister(RegNumT RegNum, Type Ty) {
769 if (Ty == IceType_void)
770 Ty = IceType_i32;
771 if (PhysicalRegisters[Ty].empty())
772 PhysicalRegisters[Ty].resize(RegX8632::Reg_NUM);
773 assert(unsigned(RegNum) < PhysicalRegisters[Ty].size());
774 Variable *Reg = PhysicalRegisters[Ty][RegNum];
775 if (Reg == nullptr) {
776 Reg = Func->makeVariable(Ty);
777 Reg->setRegNum(RegNum);
778 PhysicalRegisters[Ty][RegNum] = Reg;
779 // Specially mark a named physical register as an "argument" so that it is
780 // considered live upon function entry. Otherwise it's possible to get
781 // liveness validation errors for saving callee-save registers.
782 Func->addImplicitArg(Reg);
783 // Don't bother tracking the live range of a named physical register.
784 Reg->setIgnoreLiveness();
785 }
786 assert(RegX8632::getGprForType(Ty, RegNum) == RegNum);
787 return Reg;
788 }
789
const char *TargetX8632::getRegName(RegNumT RegNum, Type Ty) const {
791 return RegX8632::getRegName(RegX8632::getGprForType(Ty, RegNum));
792 }
793
void TargetX8632::emitVariable(const Variable *Var) const {
795 if (!BuildDefs::dump())
796 return;
797 Ostream &Str = Ctx->getStrEmit();
798 if (Var->hasReg()) {
799 Str << "%" << getRegName(Var->getRegNum(), Var->getType());
800 return;
801 }
802 if (Var->mustHaveReg()) {
803 llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
804 ") has no register assigned - function " +
805 Func->getFunctionName());
806 }
807 const int32_t Offset = Var->getStackOffset();
808 auto BaseRegNum = Var->getBaseRegNum();
809 if (BaseRegNum.hasNoValue())
810 BaseRegNum = getFrameOrStackReg();
811
812 // Print in the form "Offset(%reg)", omitting Offset when it is 0.
813 if (getFlags().getDecorateAsm()) {
814 Str << Var->getSymbolicStackOffset();
815 } else if (Offset != 0) {
816 Str << Offset;
817 }
818 const Type FrameSPTy = WordType;
819 Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
820 }
821
void TargetX8632::addProlog(CfgNode *Node) {
823 // Stack frame layout:
824 //
825 // +------------------------+ ^ +
826 // | 1. return address | |
827 // +------------------------+ v -
828 // | 2. preserved registers |
829 // +------------------------+ <--- BasePointer (if used)
830 // | 3. padding |
831 // +------------------------+
832 // | 4. global spill area |
833 // +------------------------+
834 // | 5. padding |
835 // +------------------------+
836 // | 6. local spill area |
837 // +------------------------+
838 // | 7. padding |
839 // +------------------------+
840 // | 7.5 shadow (WinX64) |
841 // +------------------------+
842 // | 8. allocas |
843 // +------------------------+
844 // | 9. padding |
845 // +------------------------+
846 // | 10. out args |
847 // +------------------------+ <--- StackPointer
848 //
849 // The following variables record the size in bytes of the given areas:
850 // * X86_RET_IP_SIZE_BYTES: area 1
851 // * PreservedRegsSizeBytes: area 2
852 // * SpillAreaPaddingBytes: area 3
853 // * GlobalsSize: area 4
854 // * LocalsSlotsPaddingBytes: area 5
855 // * GlobalsAndSubsequentPaddingSize: areas 4 - 5
856 // * LocalsSpillAreaSize: area 6
857 // * FixedAllocaSizeBytes: areas 7 - 8
858 // * SpillAreaSizeBytes: areas 3 - 10
859 // * maxOutArgsSizeBytes(): areas 9 - 10
860
861 // Determine stack frame offsets for each Variable without a register
862 // assignment. This can be done as one variable per stack slot. Or, do
863 // coalescing by running the register allocator again with an infinite set of
864 // registers (as a side effect, this gives variables a second chance at
865 // physical register assignment).
866 //
867 // A middle ground approach is to leverage sparsity and allocate one block of
868 // space on the frame for globals (variables with multi-block lifetime), and
869 // one block to share for locals (single-block lifetime).
870
871 // StackPointer: points just past return address of calling function
872
873 Context.init(Node);
874 Context.setInsertPoint(Context.getCur());
875
876 SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
877 RegsUsed = SmallBitVector(CalleeSaves.size());
878 VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
879 size_t GlobalsSize = 0;
880 // If there is a separate locals area, this represents that area. Otherwise
881 // it counts any variable not counted by GlobalsSize.
882 SpillAreaSizeBytes = 0;
883 // If there is a separate locals area, this specifies the alignment for it.
884 uint32_t LocalsSlotsAlignmentBytes = 0;
885 // The entire spill locations area gets aligned to largest natural alignment
886 // of the variables that have a spill slot.
887 uint32_t SpillAreaAlignmentBytes = 0;
888 // A spill slot linked to a variable with a stack slot should reuse that
889 // stack slot.
890 std::function<bool(Variable *)> TargetVarHook =
891 [&VariablesLinkedToSpillSlots](Variable *Var) {
892 // TODO(stichnot): Refactor this into the base class.
893 Variable *Root = Var->getLinkedToStackRoot();
894 if (Root != nullptr) {
895 assert(!Root->hasReg());
896 if (!Root->hasReg()) {
897 VariablesLinkedToSpillSlots.push_back(Var);
898 return true;
899 }
900 }
901 return false;
902 };
903
904 // Compute the list of spilled variables and bounds for GlobalsSize, etc.
905 getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
906 &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
907 &LocalsSlotsAlignmentBytes, TargetVarHook);
908 uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
909 SpillAreaSizeBytes += GlobalsSize;
910
911 // Add push instructions for preserved registers.
912 uint32_t NumCallee = 0;
913 size_t PreservedRegsSizeBytes = 0;
914 SmallBitVector Pushed(CalleeSaves.size());
915 for (RegNumT i : RegNumBVIter(CalleeSaves)) {
916 const auto Canonical = RegX8632::getBaseReg(i);
917 assert(Canonical == RegX8632::getBaseReg(Canonical));
918 if (RegsUsed[i]) {
919 Pushed[Canonical] = true;
920 }
921 }
922 for (RegNumT RegNum : RegNumBVIter(Pushed)) {
923 assert(RegNum == RegX8632::getBaseReg(RegNum));
924 ++NumCallee;
925 if (RegX8632::isXmm(RegNum)) {
926 PreservedRegsSizeBytes += 16;
927 } else {
928 PreservedRegsSizeBytes += typeWidthInBytes(WordType);
929 }
930 _push_reg(RegNum);
931 }
932 Ctx->statsUpdateRegistersSaved(NumCallee);
933
934 // StackPointer: points past preserved registers at start of spill area
935
936 // Generate "push frameptr; mov frameptr, stackptr"
937 if (IsEbpBasedFrame) {
938 assert(
939 (RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None)).count() ==
940 0);
941 PreservedRegsSizeBytes += typeWidthInBytes(WordType);
942 _link_bp();
943 }
944
945 // Align the variables area. SpillAreaPaddingBytes is the size of the region
946 // after the preserved registers and before the spill areas.
947 // LocalsSlotsPaddingBytes is the amount of padding between the globals and
948 // locals area if they are separate.
949 assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
950 uint32_t SpillAreaPaddingBytes = 0;
951 uint32_t LocalsSlotsPaddingBytes = 0;
952 alignStackSpillAreas(X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
953 SpillAreaAlignmentBytes, GlobalsSize,
954 LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
955 &LocalsSlotsPaddingBytes);
956 SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
957 uint32_t GlobalsAndSubsequentPaddingSize =
958 GlobalsSize + LocalsSlotsPaddingBytes;
959
960 // Functions returning scalar floating point types may need to convert values
961 // from an in-register xmm value to the top of the x87 floating point stack.
962 // This is done by a movp[sd] and an fld[sd]. Ensure there is enough scratch
963 // space on the stack for this.
964 const Type ReturnType = Func->getReturnType();
965 if (isScalarFloatingType(ReturnType)) {
966 // Avoid misaligned double-precision load/store.
967 RequiredStackAlignment =
968 std::max<size_t>(RequiredStackAlignment, X86_STACK_ALIGNMENT_BYTES);
969 SpillAreaSizeBytes =
970 std::max(typeWidthInBytesOnStack(ReturnType), SpillAreaSizeBytes);
971 }
972
973 RequiredStackAlignment =
974 std::max<size_t>(RequiredStackAlignment, SpillAreaAlignmentBytes);
975
976 if (PrologEmitsFixedAllocas) {
977 RequiredStackAlignment =
978 std::max(RequiredStackAlignment, FixedAllocaAlignBytes);
979 }
980
981 // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
982 // fixed allocations in the prolog.
983 if (PrologEmitsFixedAllocas)
984 SpillAreaSizeBytes += FixedAllocaSizeBytes;
985
986 // Entering the function has made the stack pointer unaligned. Re-align it by
987 // adjusting the stack size.
  // Note that StackOffset does not include the spill area. It's the offset
  // from the base stack pointer (ebp), whether we set it or not, to the first
  // stack arg (if any). StackSize, on the other hand, does include the spill
  // area.
991 const uint32_t StackOffset = X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
992 uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
993 RequiredStackAlignment);
994 StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
995 RequiredStackAlignment);
996 SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any
997
998 if (SpillAreaSizeBytes) {
999 auto *Func = Node->getCfg();
1000 if (SpillAreaSizeBytes > Func->getStackSizeLimit()) {
1001 Func->setError("Stack size limit exceeded");
1002 }
1003
1004 emitStackProbe(SpillAreaSizeBytes);
1005
1006 // Generate "sub stackptr, SpillAreaSizeBytes"
1007 _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
1008 }
1009
1010 // StackPointer: points just past the spill area (end of stack frame)
1011
1012 // If the required alignment is greater than the stack pointer's guaranteed
1013 // alignment, align the stack pointer accordingly.
1014 if (RequiredStackAlignment > X86_STACK_ALIGNMENT_BYTES) {
1015 assert(IsEbpBasedFrame);
1016 _and(getPhysicalRegister(getStackReg(), WordType),
1017 Ctx->getConstantInt32(-RequiredStackAlignment));
1018 }
1019
1020 // StackPointer: may have just been offset for alignment
1021
1022 // Account for known-frame-offset alloca instructions that were not already
1023 // combined into the prolog.
1024 if (!PrologEmitsFixedAllocas)
1025 SpillAreaSizeBytes += FixedAllocaSizeBytes;
1026
1027 Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
1028
1029 // Fill in stack offsets for stack args, and copy args into registers for
1030 // those that were register-allocated. Args are pushed right to left, so
1031 // Arg[0] is closest to the stack/frame pointer.
1032 RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
1033 Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, WordType);
1034 size_t BasicFrameOffset = StackOffset;
1035 if (!IsEbpBasedFrame)
1036 BasicFrameOffset += SpillAreaSizeBytes;
1037
1038 const VarList &Args = Func->getArgs();
1039 size_t InArgsSizeBytes = 0;
1040 unsigned NumXmmArgs = 0;
1041 unsigned NumGPRArgs = 0;
1042 for (SizeT i = 0, NumArgs = Args.size(); i < NumArgs; ++i) {
1043 Variable *Arg = Args[i];
1044 // Skip arguments passed in registers.
1045 if (isVectorType(Arg->getType())) {
1046 if (RegX8632::getRegisterForXmmArgNum(
1047 RegX8632::getArgIndex(i, NumXmmArgs))
1048 .hasValue()) {
1049 ++NumXmmArgs;
1050 continue;
1051 }
1052 } else if (!isScalarFloatingType(Arg->getType())) {
1053 assert(isScalarIntegerType(Arg->getType()));
1054 if (RegX8632::getRegisterForGprArgNum(
1055 WordType, RegX8632::getArgIndex(i, NumGPRArgs))
1056 .hasValue()) {
1057 ++NumGPRArgs;
1058 continue;
1059 }
1060 }
1061 // For esp-based frames where the allocas are done outside the prolog, the
1062 // esp value may not stabilize to its home value until after all the
1063 // fixed-size alloca instructions have executed. In this case, a stack
1064 // adjustment is needed when accessing in-args in order to copy them into
1065 // registers.
1066 size_t StackAdjBytes = 0;
1067 if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas)
1068 StackAdjBytes -= FixedAllocaSizeBytes;
1069 finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
1070 InArgsSizeBytes);
1071 }
1072
1073 // Fill in stack offsets for locals.
1074 assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
1075 SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
1076 IsEbpBasedFrame && !needsStackPointerAlignment());
1077 // Assign stack offsets to variables that have been linked to spilled
1078 // variables.
1079 for (Variable *Var : VariablesLinkedToSpillSlots) {
1080 const Variable *Root = Var->getLinkedToStackRoot();
1081 assert(Root != nullptr);
1082 Var->setStackOffset(Root->getStackOffset());
1083
1084 // If the stack root variable is an arg, make this variable an arg too so
1085 // that stackVarToAsmAddress uses the correct base pointer (e.g. ebp on
1086 // x86).
1087 Var->setIsArg(Root->getIsArg());
1088 }
1089 this->HasComputedFrame = true;
1090
1091 if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
1092 OstreamLocker L(Func->getContext());
1093 Ostream &Str = Func->getContext()->getStrDump();
1094
1095 Str << "Stack layout:\n";
1096 uint32_t EspAdjustmentPaddingSize =
1097 SpillAreaSizeBytes - LocalsSpillAreaSize -
1098 GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
1099 maxOutArgsSizeBytes();
1100 Str << " in-args = " << InArgsSizeBytes << " bytes\n"
1101 << " return address = " << X86_RET_IP_SIZE_BYTES << " bytes\n"
1102 << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
1103 << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
1104 << " globals spill area = " << GlobalsSize << " bytes\n"
1105 << " globals-locals spill areas intermediate padding = "
1106 << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
1107 << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
1108 << " esp alignment padding = " << EspAdjustmentPaddingSize
1109 << " bytes\n";
1110
1111 Str << "Stack details:\n"
1112 << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
1113 << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
1114 << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n"
1115 << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
1116 << " bytes\n"
1117 << " is ebp based = " << IsEbpBasedFrame << "\n";
1118 }
1119 }
1120
1121 /// Helper function for addProlog().
1122 ///
1123 /// This assumes Arg is an argument passed on the stack. This sets the frame
1124 /// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
1125 /// I64 arg that has been split into Lo and Hi components, it calls itself
1126 /// recursively on the components, taking care to handle Lo first because of the
1127 /// little-endian architecture. Lastly, this function generates an instruction
1128 /// to copy Arg into its assigned register if applicable.
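///
/// Illustrative example: an i64 stack argument split into Lo/Hi halves gets
/// the Lo half at the lower frame offset and the Hi half in the next 4-byte
/// slot, matching the little-endian in-memory layout.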
1129
void TargetX8632::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
                                         size_t BasicFrameOffset,
                                         size_t StackAdjBytes,
                                         size_t &InArgsSizeBytes) {
1134 if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
1135 Variable *Lo = Arg64On32->getLo();
1136 Variable *Hi = Arg64On32->getHi();
1137 finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, StackAdjBytes,
1138 InArgsSizeBytes);
1139 finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, StackAdjBytes,
1140 InArgsSizeBytes);
1141 return;
1142 }
1143 Type Ty = Arg->getType();
1144 if (isVectorType(Ty)) {
1145 InArgsSizeBytes = applyStackAlignment(InArgsSizeBytes);
1146 }
1147 Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
1148 InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
1149 if (Arg->hasReg()) {
1150 assert(Ty != IceType_i64);
1151 auto *Mem = X86OperandMem::create(
1152 Func, Ty, FramePtr,
1153 Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes));
1154 if (isVectorType(Arg->getType())) {
1155 _movp(Arg, Mem);
1156 } else {
1157 _mov(Arg, Mem);
1158 }
1159 // This argument-copying instruction uses an explicit X86OperandMem
1160 // operand instead of a Variable, so its fill-from-stack operation has to
1161 // be tracked separately for statistics.
1162 Ctx->statsUpdateFills();
1163 }
1164 }
1165
void TargetX8632::addEpilog(CfgNode *Node) {
1167 InstList &Insts = Node->getInsts();
1168 InstList::reverse_iterator RI, E;
1169 for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
1170 if (llvm::isa<Insts::Ret>(*RI))
1171 break;
1172 }
1173 if (RI == E)
1174 return;
1175
1176 // Convert the reverse_iterator position into its corresponding (forward)
1177 // iterator position.
1178 InstList::iterator InsertPoint = reverseToForwardIterator(RI);
1179 --InsertPoint;
1180 Context.init(Node);
1181 Context.setInsertPoint(InsertPoint);
1182
1183 if (IsEbpBasedFrame) {
1184 _unlink_bp();
1185 } else {
1186 // add stackptr, SpillAreaSizeBytes
1187 if (SpillAreaSizeBytes != 0) {
1188 _add_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
1189 }
1190 }
1191
1192 // Add pop instructions for preserved registers.
1193 SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
1194 SmallBitVector Popped(CalleeSaves.size());
1195 for (int32_t i = CalleeSaves.size() - 1; i >= 0; --i) {
1196 const auto RegNum = RegNumT::fromInt(i);
1197 if (RegNum == getFrameReg() && IsEbpBasedFrame)
1198 continue;
1199 const RegNumT Canonical = RegX8632::getBaseReg(RegNum);
1200 if (CalleeSaves[i] && RegsUsed[i]) {
1201 Popped[Canonical] = true;
1202 }
1203 }
1204 for (int32_t i = Popped.size() - 1; i >= 0; --i) {
1205 if (!Popped[i])
1206 continue;
1207 const auto RegNum = RegNumT::fromInt(i);
1208 assert(RegNum == RegX8632::getBaseReg(RegNum));
1209 _pop_reg(RegNum);
1210 }
1211 }
1212
Type TargetX8632::stackSlotType() { return WordType; }
1214
Operand *TargetX8632::loOperand(Operand *Operand) {
1216 assert(Operand->getType() == IceType_i64 ||
1217 Operand->getType() == IceType_f64);
1218 if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
1219 return Operand;
1220 if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
1221 return Var64On32->getLo();
1222 if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
1223 auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
1224 Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue())));
1225 // Check if we need to blind/pool the constant.
1226 return legalize(ConstInt);
1227 }
1228 if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
1229 auto *MemOperand = X86OperandMem::create(
1230 Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getIndex(),
1231 Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
    // Test whether we should randomize or pool the offset; if so, create the
    // mem operand with the blinded/pooled constant. Otherwise, return the mem
    // operand as an ordinary mem operand.
1235 return legalize(MemOperand);
1236 }
1237 llvm_unreachable("Unsupported operand type");
1238 return nullptr;
1239 }
1240
Operand *TargetX8632::hiOperand(Operand *Operand) {
1242 assert(Operand->getType() == IceType_i64 ||
1243 Operand->getType() == IceType_f64);
1244 if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
1245 return Operand;
1246 if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
1247 return Var64On32->getHi();
1248 if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
1249 auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
1250 Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue() >> 32)));
1251 // Check if we need to blind/pool the constant.
1252 return legalize(ConstInt);
1253 }
1254 if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
1255 Constant *Offset = Mem->getOffset();
1256 if (Offset == nullptr) {
1257 Offset = Ctx->getConstantInt32(4);
1258 } else if (auto *IntOffset = llvm::dyn_cast<ConstantInteger32>(Offset)) {
1259 Offset = Ctx->getConstantInt32(4 + IntOffset->getValue());
1260 } else if (auto *SymOffset = llvm::dyn_cast<ConstantRelocatable>(Offset)) {
1261 assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4));
1262 Offset =
1263 Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName());
1264 }
1265 auto *MemOperand = X86OperandMem::create(
1266 Func, IceType_i32, Mem->getBase(), Offset, Mem->getIndex(),
1267 Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
    // Test if the Offset is an eligible i32 constant for randomization and
    // pooling. Blind/pool it if it is. Otherwise return it as an ordinary mem
    // operand.
1271 return legalize(MemOperand);
1272 }
1273 llvm_unreachable("Unsupported operand type");
1274 return nullptr;
1275 }
1276
SmallBitVector TargetX8632::getRegisterSet(RegSetMask Include,
                                           RegSetMask Exclude) const {
1279 return RegX8632::getRegisterSet(getFlags(), Include, Exclude);
1280 }
1281
void TargetX8632::lowerAlloca(const InstAlloca *Instr) {
1283 // Conservatively require the stack to be aligned. Some stack adjustment
1284 // operations implemented below assume that the stack is aligned before the
1285 // alloca. All the alloca code ensures that the stack alignment is preserved
1286 // after the alloca. The stack alignment restriction can be relaxed in some
1287 // cases.
1288 RequiredStackAlignment =
1289 std::max<size_t>(RequiredStackAlignment, X86_STACK_ALIGNMENT_BYTES);
1290
1291 // For default align=0, set it to the real value 1, to avoid any
1292 // bit-manipulation problems below.
1293 const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes());
1294
1295 // LLVM enforces power of 2 alignment.
1296 assert(llvm::isPowerOf2_32(AlignmentParam));
1297 assert(llvm::isPowerOf2_32(X86_STACK_ALIGNMENT_BYTES));
1298
1299 const uint32_t Alignment =
1300 std::max(AlignmentParam, X86_STACK_ALIGNMENT_BYTES);
1301 const bool OverAligned = Alignment > X86_STACK_ALIGNMENT_BYTES;
1302 const bool OptM1 = Func->getOptLevel() == Opt_m1;
1303 const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset();
1304 const bool UseFramePointer =
1305 hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;
1306
1307 if (UseFramePointer)
1308 setHasFramePointer();
1309
1310 Variable *esp = getPhysicalRegister(getStackReg(), WordType);
1311 if (OverAligned) {
1312 _and(esp, Ctx->getConstantInt32(-Alignment));
1313 }
1314
1315 Variable *Dest = Instr->getDest();
1316 Operand *TotalSize = legalize(Instr->getSizeInBytes());
1317
1318 if (const auto *ConstantTotalSize =
1319 llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
1320 const uint32_t Value =
1321 Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
1322 if (UseFramePointer) {
1323 _sub_sp(Ctx->getConstantInt32(Value));
1324 } else {
      // If we don't need a Frame Pointer, this alloca has a known offset to
      // the stack pointer. We don't need to adjust the stack pointer, nor
      // assign any value to Dest, as Dest is rematerializable.
1328 assert(Dest->isRematerializable());
1329 FixedAllocaSizeBytes += Value;
1330 Context.insert<InstFakeDef>(Dest);
1331 }
1332 } else {
1333 // Non-constant sizes need to be adjusted to the next highest multiple of
1334 // the required alignment at runtime.
1335 Variable *T = makeReg(IceType_i32);
1336 _mov(T, TotalSize);
1337 _add(T, Ctx->getConstantInt32(Alignment - 1));
1338 _and(T, Ctx->getConstantInt32(-Alignment));
1339 _sub_sp(T);
1340 }
1341 // Add enough to the returned address to account for the out args area.
1342 uint32_t OutArgsSize = maxOutArgsSizeBytes();
1343 if (OutArgsSize > 0) {
1344 Variable *T = makeReg(Dest->getType());
1345 auto *CalculateOperand = X86OperandMem::create(
1346 Func, IceType_void, esp, Ctx->getConstantInt(IceType_i32, OutArgsSize));
1347 _lea(T, CalculateOperand);
1348 _mov(Dest, T);
1349 } else {
1350 _mov(Dest, esp);
1351 }
1352 }
1353
void TargetX8632::lowerArguments() {
1355 const bool OptM1 = Func->getOptLevel() == Opt_m1;
1356 VarList &Args = Func->getArgs();
1357 unsigned NumXmmArgs = 0;
1358 bool XmmSlotsRemain = true;
1359 unsigned NumGprArgs = 0;
1360 bool GprSlotsRemain = true;
1361
1362 Context.init(Func->getEntryNode());
1363 Context.setInsertPoint(Context.getCur());
1364
1365 for (SizeT i = 0, End = Args.size();
1366 i < End && (XmmSlotsRemain || GprSlotsRemain); ++i) {
1367 Variable *Arg = Args[i];
1368 Type Ty = Arg->getType();
1369 Variable *RegisterArg = nullptr;
1370 RegNumT RegNum;
1371 if (isVectorType(Ty)) {
1372 RegNum = RegX8632::getRegisterForXmmArgNum(
1373 RegX8632::getArgIndex(i, NumXmmArgs));
1374 if (RegNum.hasNoValue()) {
1375 XmmSlotsRemain = false;
1376 continue;
1377 }
1378 ++NumXmmArgs;
1379 RegisterArg = Func->makeVariable(Ty);
1380 } else if (isScalarFloatingType(Ty)) {
1381 continue;
1382 } else if (isScalarIntegerType(Ty)) {
1383 RegNum = RegX8632::getRegisterForGprArgNum(
1384 Ty, RegX8632::getArgIndex(i, NumGprArgs));
1385 if (RegNum.hasNoValue()) {
1386 GprSlotsRemain = false;
1387 continue;
1388 }
1389 ++NumGprArgs;
1390 RegisterArg = Func->makeVariable(Ty);
1391 }
1392 assert(RegNum.hasValue());
1393 assert(RegisterArg != nullptr);
1394 // Replace Arg in the argument list with the home register. Then generate
1395 // an instruction in the prolog to copy the home register to the assigned
1396 // location of Arg.
1397 if (BuildDefs::dump())
1398 RegisterArg->setName(Func, "home_reg:" + Arg->getName());
1399 RegisterArg->setRegNum(RegNum);
1400 RegisterArg->setIsArg();
1401 Arg->setIsArg(false);
1402
1403 Args[i] = RegisterArg;
1404 // When not Om1, do the assignment through a temporary, instead of directly
1405 // from the pre-colored variable, so that a subsequent availabilityGet()
1406 // call has a chance to work. (In Om1, don't bother creating extra
1407 // instructions with extra variables to register-allocate.)
1408 if (OptM1) {
1409 Context.insert<InstAssign>(Arg, RegisterArg);
1410 } else {
1411 Variable *Tmp = makeReg(RegisterArg->getType());
1412 Context.insert<InstAssign>(Tmp, RegisterArg);
1413 Context.insert<InstAssign>(Arg, Tmp);
1414 }
1415 }
1416 if (!OptM1)
1417 Context.availabilityUpdate();
1418 }
1419
/// Strength-reduce scalar integer multiplication by a constant (for i32 or
/// narrower) for certain constants. The lea instruction can be used to multiply
/// by 3, 5, or 9, and the shl instruction can be used to multiply by powers of
/// 2. These can be combined such that e.g. multiplying by 100 can be done as 2
/// lea-based multiplies by 5, combined with left-shifting by 2.
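///
/// Worked example (illustrative): 100 = 5 * 5 * 2 * 2, so the loop below
/// records Count5 = 2 and Count2 = 2, which is emitted as two
/// "lea t, (t, t, 4)" instructions (each multiplying by 5) followed by a
/// single "shl t, 2".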
1425
bool TargetX8632::optimizeScalarMul(Variable *Dest, Operand *Src0,
                                    int32_t Src1) {
1428 // Disable this optimization for Om1 and O0, just to keep things simple
1429 // there.
1430 if (Func->getOptLevel() < Opt_1)
1431 return false;
1432 Type Ty = Dest->getType();
1433 if (Src1 == -1) {
1434 Variable *T = nullptr;
1435 _mov(T, Src0);
1436 _neg(T);
1437 _mov(Dest, T);
1438 return true;
1439 }
1440 if (Src1 == 0) {
1441 _mov(Dest, Ctx->getConstantZero(Ty));
1442 return true;
1443 }
1444 if (Src1 == 1) {
1445 Variable *T = nullptr;
1446 _mov(T, Src0);
1447 _mov(Dest, T);
1448 return true;
1449 }
1450 // Don't bother with the edge case where Src1 == MININT.
1451 if (Src1 == -Src1)
1452 return false;
1453 const bool Src1IsNegative = Src1 < 0;
1454 if (Src1IsNegative)
1455 Src1 = -Src1;
1456 uint32_t Count9 = 0;
1457 uint32_t Count5 = 0;
1458 uint32_t Count3 = 0;
1459 uint32_t Count2 = 0;
1460 uint32_t CountOps = 0;
1461 while (Src1 > 1) {
1462 if (Src1 % 9 == 0) {
1463 ++CountOps;
1464 ++Count9;
1465 Src1 /= 9;
1466 } else if (Src1 % 5 == 0) {
1467 ++CountOps;
1468 ++Count5;
1469 Src1 /= 5;
1470 } else if (Src1 % 3 == 0) {
1471 ++CountOps;
1472 ++Count3;
1473 Src1 /= 3;
1474 } else if (Src1 % 2 == 0) {
1475 if (Count2 == 0)
1476 ++CountOps;
1477 ++Count2;
1478 Src1 /= 2;
1479 } else {
1480 return false;
1481 }
1482 }
1483 // Lea optimization only works for i16 and i32 types, not i8.
1484 if (Ty != IceType_i32 && (Count3 || Count5 || Count9))
1485 return false;
1486 // Limit the number of lea/shl operations for a single multiply, to a
1487 // somewhat arbitrary choice of 3.
1488 constexpr uint32_t MaxOpsForOptimizedMul = 3;
1489 if (CountOps > MaxOpsForOptimizedMul)
1490 return false;
1491 Variable *T = makeReg(WordType);
1492 if (typeWidthInBytes(Src0->getType()) < typeWidthInBytes(T->getType())) {
1493 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
1494 _movzx(T, Src0RM);
1495 } else {
1496 _mov(T, Src0);
1497 }
1498 Constant *Zero = Ctx->getConstantZero(IceType_i32);
1499 for (uint32_t i = 0; i < Count9; ++i) {
1500 constexpr uint16_t Shift = 3; // log2(9-1)
1501 _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1502 }
1503 for (uint32_t i = 0; i < Count5; ++i) {
1504 constexpr uint16_t Shift = 2; // log2(5-1)
1505 _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1506 }
1507 for (uint32_t i = 0; i < Count3; ++i) {
1508 constexpr uint16_t Shift = 1; // log2(3-1)
1509 _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1510 }
1511 if (Count2) {
1512 _shl(T, Ctx->getConstantInt(Ty, Count2));
1513 }
1514 if (Src1IsNegative)
1515 _neg(T);
1516 _mov(Dest, T);
1517 return true;
1518 }
1519
1520 void TargetX8632::lowerShift64(InstArithmetic::OpKind Op, Operand *Src0Lo,
1521 Operand *Src0Hi, Operand *Src1Lo,
1522 Variable *DestLo, Variable *DestHi) {
1523 // TODO: Refactor the similarities between Shl, Lshr, and Ashr.
1524 Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
1525 Constant *Zero = Ctx->getConstantZero(IceType_i32);
1526 Constant *SignExtend = Ctx->getConstantInt32(0x1f);
1527 if (auto *ConstantShiftAmount = llvm::dyn_cast<ConstantInteger32>(Src1Lo)) {
1528 uint32_t ShiftAmount = ConstantShiftAmount->getValue();
1529 if (ShiftAmount > 32) {
1530 Constant *ReducedShift = Ctx->getConstantInt32(ShiftAmount - 32);
1531 switch (Op) {
1532 default:
1533 assert(0 && "non-shift op");
1534 break;
1535 case InstArithmetic::Shl: {
1536 // a=b<<c ==>
1537 // t2 = b.lo
1538 // t2 = shl t2, ShiftAmount-32
1539       //   a.hi = t2
1540       //   a.lo = 0
1541 _mov(T_2, Src0Lo);
1542 _shl(T_2, ReducedShift);
1543 _mov(DestHi, T_2);
1544 _mov(DestLo, Zero);
1545 } break;
1546 case InstArithmetic::Lshr: {
1547 // a=b>>c (unsigned) ==>
1548 // t2 = b.hi
1549 // t2 = shr t2, ShiftAmount-32
1550 // a.lo = t2
1551 // a.hi = 0
1552 _mov(T_2, Src0Hi);
1553 _shr(T_2, ReducedShift);
1554 _mov(DestLo, T_2);
1555 _mov(DestHi, Zero);
1556 } break;
1557 case InstArithmetic::Ashr: {
1558 // a=b>>c (signed) ==>
1559 // t3 = b.hi
1560 // t3 = sar t3, 0x1f
1561 // t2 = b.hi
1562 // t2 = shrd t2, t3, ShiftAmount-32
1563 // a.lo = t2
1564 // a.hi = t3
1565 _mov(T_3, Src0Hi);
1566 _sar(T_3, SignExtend);
1567 _mov(T_2, Src0Hi);
1568 _shrd(T_2, T_3, ReducedShift);
1569 _mov(DestLo, T_2);
1570 _mov(DestHi, T_3);
1571 } break;
1572 }
1573 } else if (ShiftAmount == 32) {
1574 switch (Op) {
1575 default:
1576 assert(0 && "non-shift op");
1577 break;
1578 case InstArithmetic::Shl: {
1579 // a=b<<c ==>
1580 // t2 = b.lo
1581 // a.hi = t2
1582 // a.lo = 0
1583 _mov(T_2, Src0Lo);
1584 _mov(DestHi, T_2);
1585 _mov(DestLo, Zero);
1586 } break;
1587 case InstArithmetic::Lshr: {
1588 // a=b>>c (unsigned) ==>
1589 // t2 = b.hi
1590 // a.lo = t2
1591 // a.hi = 0
1592 _mov(T_2, Src0Hi);
1593 _mov(DestLo, T_2);
1594 _mov(DestHi, Zero);
1595 } break;
1596 case InstArithmetic::Ashr: {
1597 // a=b>>c (signed) ==>
1598 // t2 = b.hi
1599 // a.lo = t2
1600 // t3 = b.hi
1601 // t3 = sar t3, 0x1f
1602 // a.hi = t3
1603 _mov(T_2, Src0Hi);
1604 _mov(DestLo, T_2);
1605 _mov(T_3, Src0Hi);
1606 _sar(T_3, SignExtend);
1607 _mov(DestHi, T_3);
1608 } break;
1609 }
1610 } else {
1611 // COMMON PREFIX OF: a=b SHIFT_OP c ==>
1612 // t2 = b.lo
1613 // t3 = b.hi
1614 _mov(T_2, Src0Lo);
1615 _mov(T_3, Src0Hi);
1616 switch (Op) {
1617 default:
1618 assert(0 && "non-shift op");
1619 break;
1620 case InstArithmetic::Shl: {
1621 // a=b<<c ==>
1622 // t3 = shld t3, t2, ShiftAmount
1623 // t2 = shl t2, ShiftAmount
1624 _shld(T_3, T_2, ConstantShiftAmount);
1625 _shl(T_2, ConstantShiftAmount);
1626 } break;
1627 case InstArithmetic::Lshr: {
1628 // a=b>>c (unsigned) ==>
1629 // t2 = shrd t2, t3, ShiftAmount
1630 // t3 = shr t3, ShiftAmount
1631 _shrd(T_2, T_3, ConstantShiftAmount);
1632 _shr(T_3, ConstantShiftAmount);
1633 } break;
1634 case InstArithmetic::Ashr: {
1635 // a=b>>c (signed) ==>
1636 // t2 = shrd t2, t3, ShiftAmount
1637 // t3 = sar t3, ShiftAmount
1638 _shrd(T_2, T_3, ConstantShiftAmount);
1639 _sar(T_3, ConstantShiftAmount);
1640 } break;
1641 }
1642 // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
1643 // a.lo = t2
1644 // a.hi = t3
1645 _mov(DestLo, T_2);
1646 _mov(DestHi, T_3);
1647 }
1648 } else {
1649 // NON-CONSTANT CASES.
1650 Constant *BitTest = Ctx->getConstantInt32(0x20);
1651 InstX86Label *Label = InstX86Label::create(Func, this);
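    // The 32-bit shifts (and shld/shrd) only use the low 5 bits of cl, so for
    // amounts in [32, 64) the main sequence below computes the result as if
    // shifting by (amount mod 32). Testing bit 5 (0x20) of the amount and
    // conditionally executing the fixup moves corrects this by swapping the
    // halves and filling the vacated half with zeros (or sign bits for ashr).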
1652 // COMMON PREFIX OF: a=b SHIFT_OP c ==>
1653 // t1:ecx = c.lo & 0xff
1654 // t2 = b.lo
1655 // t3 = b.hi
1656 T_1 = copyToReg8(Src1Lo, RegX8632::Reg_cl);
1657 _mov(T_2, Src0Lo);
1658 _mov(T_3, Src0Hi);
1659 switch (Op) {
1660 default:
1661 assert(0 && "non-shift op");
1662 break;
1663 case InstArithmetic::Shl: {
1664 // a=b<<c ==>
1665 // t3 = shld t3, t2, t1
1666 // t2 = shl t2, t1
1667 // test t1, 0x20
1668 // je L1
1669 // use(t3)
1670 // t3 = t2
1671 // t2 = 0
1672 _shld(T_3, T_2, T_1);
1673 _shl(T_2, T_1);
1674 _test(T_1, BitTest);
1675 _br(CondX86::Br_e, Label);
1676 // T_2 and T_3 are being assigned again because of the intra-block control
1677 // flow, so we need to use _redefined to avoid liveness problems.
1678 _redefined(_mov(T_3, T_2));
1679 _redefined(_mov(T_2, Zero));
1680 } break;
1681 case InstArithmetic::Lshr: {
1682 // a=b>>c (unsigned) ==>
1683 // t2 = shrd t2, t3, t1
1684 // t3 = shr t3, t1
1685 // test t1, 0x20
1686 // je L1
1687 // use(t2)
1688 // t2 = t3
1689 // t3 = 0
1690 _shrd(T_2, T_3, T_1);
1691 _shr(T_3, T_1);
1692 _test(T_1, BitTest);
1693 _br(CondX86::Br_e, Label);
1694 // T_2 and T_3 are being assigned again because of the intra-block control
1695 // flow, so we need to use _redefined to avoid liveness problems.
1696 _redefined(_mov(T_2, T_3));
1697 _redefined(_mov(T_3, Zero));
1698 } break;
1699 case InstArithmetic::Ashr: {
1700 // a=b>>c (signed) ==>
1701 // t2 = shrd t2, t3, t1
1702 // t3 = sar t3, t1
1703 // test t1, 0x20
1704 // je L1
1705 // use(t2)
1706 // t2 = t3
1707 // t3 = sar t3, 0x1f
1708 Constant *SignExtend = Ctx->getConstantInt32(0x1f);
1709 _shrd(T_2, T_3, T_1);
1710 _sar(T_3, T_1);
1711 _test(T_1, BitTest);
1712 _br(CondX86::Br_e, Label);
1713 // T_2 and T_3 are being assigned again because of the intra-block control
1714 // flow, so T_2 needs to use _redefined to avoid liveness problems. T_3
1715 // doesn't need special treatment because it is reassigned via _sar
1716 // instead of _mov.
1717 _redefined(_mov(T_2, T_3));
1718 _sar(T_3, SignExtend);
1719 } break;
1720 }
1721 // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
1722 // L1:
1723 // a.lo = t2
1724 // a.hi = t3
1725 Context.insert(Label);
1726 _mov(DestLo, T_2);
1727 _mov(DestHi, T_3);
1728 }
1729 }
1730
1731 void TargetX8632::lowerArithmetic(const InstArithmetic *Instr) {
1732 Variable *Dest = Instr->getDest();
1733 if (Dest->isRematerializable()) {
1734 Context.insert<InstFakeDef>(Dest);
1735 return;
1736 }
1737 Type Ty = Dest->getType();
1738 Operand *Src0 = legalize(Instr->getSrc(0));
1739 Operand *Src1 = legalize(Instr->getSrc(1));
1740 if (Instr->isCommutative()) {
1741 uint32_t SwapCount = 0;
1742 if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) {
1743 std::swap(Src0, Src1);
1744 ++SwapCount;
1745 }
1746 if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) {
1747 std::swap(Src0, Src1);
1748 ++SwapCount;
1749 }
1750 // Improve two-address code patterns by avoiding a copy to the dest
1751 // register when one of the source operands ends its lifetime here.
1752 if (!Instr->isLastUse(Src0) && Instr->isLastUse(Src1)) {
1753 std::swap(Src0, Src1);
1754 ++SwapCount;
1755 }
1756 assert(SwapCount <= 1);
1757 (void)SwapCount;
1758 }
1759 if (Ty == IceType_i64) {
1760 // These x86-32 helper-call-involved instructions are lowered in this
1761 // separate switch. This is because loOperand() and hiOperand() may insert
1762 // redundant instructions for constant blinding and pooling. Such redundant
1763     // instructions will fail liveness analysis under the -Om1 setting. Moreover,
1764     // these arguments do not need to be processed with loOperand() and
1765     // hiOperand() before they can be used.
1766 switch (Instr->getOp()) {
1767 case InstArithmetic::Udiv:
1768 case InstArithmetic::Sdiv:
1769 case InstArithmetic::Urem:
1770 case InstArithmetic::Srem:
1771 llvm::report_fatal_error("Helper call was expected");
1772 return;
1773 default:
1774 break;
1775 }
1776
1777 auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
1778 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
1779 Operand *Src0Lo = loOperand(Src0);
1780 Operand *Src0Hi = hiOperand(Src0);
1781 Operand *Src1Lo = loOperand(Src1);
1782 Operand *Src1Hi = hiOperand(Src1);
1783 Variable *T_Lo = nullptr, *T_Hi = nullptr;
1784 switch (Instr->getOp()) {
1785 case InstArithmetic::_num:
1786 llvm_unreachable("Unknown arithmetic operator");
1787 break;
1788 case InstArithmetic::Add:
1789 _mov(T_Lo, Src0Lo);
1790 _add(T_Lo, Src1Lo);
1791 _mov(DestLo, T_Lo);
1792 _mov(T_Hi, Src0Hi);
1793 _adc(T_Hi, Src1Hi);
1794 _mov(DestHi, T_Hi);
1795 break;
1796 case InstArithmetic::And:
1797 _mov(T_Lo, Src0Lo);
1798 _and(T_Lo, Src1Lo);
1799 _mov(DestLo, T_Lo);
1800 _mov(T_Hi, Src0Hi);
1801 _and(T_Hi, Src1Hi);
1802 _mov(DestHi, T_Hi);
1803 break;
1804 case InstArithmetic::Or:
1805 _mov(T_Lo, Src0Lo);
1806 _or(T_Lo, Src1Lo);
1807 _mov(DestLo, T_Lo);
1808 _mov(T_Hi, Src0Hi);
1809 _or(T_Hi, Src1Hi);
1810 _mov(DestHi, T_Hi);
1811 break;
1812 case InstArithmetic::Xor:
1813 _mov(T_Lo, Src0Lo);
1814 _xor(T_Lo, Src1Lo);
1815 _mov(DestLo, T_Lo);
1816 _mov(T_Hi, Src0Hi);
1817 _xor(T_Hi, Src1Hi);
1818 _mov(DestHi, T_Hi);
1819 break;
1820 case InstArithmetic::Sub:
1821 _mov(T_Lo, Src0Lo);
1822 _sub(T_Lo, Src1Lo);
1823 _mov(DestLo, T_Lo);
1824 _mov(T_Hi, Src0Hi);
1825 _sbb(T_Hi, Src1Hi);
1826 _mov(DestHi, T_Hi);
1827 break;
1828 case InstArithmetic::Mul: {
1829 Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
1830 Variable *T_4Lo = makeReg(IceType_i32, RegX8632::Reg_eax);
1831 Variable *T_4Hi = makeReg(IceType_i32, RegX8632::Reg_edx);
1832 // gcc does the following:
1833 // a=b*c ==>
1834 // t1 = b.hi; t1 *=(imul) c.lo
1835 // t2 = c.hi; t2 *=(imul) b.lo
1836 // t3:eax = b.lo
1837 // t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo
1838 // a.lo = t4.lo
1839 // t4.hi += t1
1840 // t4.hi += t2
1841 // a.hi = t4.hi
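      // The sequence follows from writing b and c as (hi * 2^32 + lo):
      //   b*c = b.lo*c.lo                         // t4, the 64-bit mul
      //       + 2^32 * (b.hi*c.lo + b.lo*c.hi)    // t1 and t2, added into t4.hi
      //       + 2^64 * (b.hi*c.hi)                // dropped: beyond 64 bits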
1842 // The mul instruction cannot take an immediate operand.
1843 Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
1844 _mov(T_1, Src0Hi);
1845 _imul(T_1, Src1Lo);
1846 _mov(T_3, Src0Lo, RegX8632::Reg_eax);
1847 _mul(T_4Lo, T_3, Src1Lo);
1848 // The mul instruction produces two dest variables, edx:eax. We create a
1849 // fake definition of edx to account for this.
1850 Context.insert<InstFakeDef>(T_4Hi, T_4Lo);
1851 Context.insert<InstFakeUse>(T_4Hi);
1852 _mov(DestLo, T_4Lo);
1853 _add(T_4Hi, T_1);
1854 _mov(T_2, Src1Hi);
1855 Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
1856 _imul(T_2, Src0Lo);
1857 _add(T_4Hi, T_2);
1858 _mov(DestHi, T_4Hi);
1859 } break;
1860 case InstArithmetic::Shl:
1861 case InstArithmetic::Lshr:
1862 case InstArithmetic::Ashr:
1863 lowerShift64(Instr->getOp(), Src0Lo, Src0Hi, Src1Lo, DestLo, DestHi);
1864 break;
1865 case InstArithmetic::Fadd:
1866 case InstArithmetic::Fsub:
1867 case InstArithmetic::Fmul:
1868 case InstArithmetic::Fdiv:
1869 case InstArithmetic::Frem:
1870 llvm_unreachable("FP instruction with i64 type");
1871 break;
1872 case InstArithmetic::Udiv:
1873 case InstArithmetic::Sdiv:
1874 case InstArithmetic::Urem:
1875 case InstArithmetic::Srem:
1876 llvm_unreachable("Call-helper-involved instruction for i64 type \
1877 should have already been handled before");
1878 break;
1879 }
1880 return;
1881 }
1882 if (isVectorType(Ty)) {
1883 // TODO: Trap on integer divide and integer modulo by zero. See:
1884 // https://code.google.com/p/nativeclient/issues/detail?id=3899
1885 if (llvm::isa<X86OperandMem>(Src1))
1886 Src1 = legalizeToReg(Src1);
1887 switch (Instr->getOp()) {
1888 case InstArithmetic::_num:
1889 llvm_unreachable("Unknown arithmetic operator");
1890 break;
1891 case InstArithmetic::Add: {
1892 Variable *T = makeReg(Ty);
1893 _movp(T, Src0);
1894 _padd(T, Src1);
1895 _movp(Dest, T);
1896 } break;
1897 case InstArithmetic::And: {
1898 Variable *T = makeReg(Ty);
1899 _movp(T, Src0);
1900 _pand(T, Src1);
1901 _movp(Dest, T);
1902 } break;
1903 case InstArithmetic::Or: {
1904 Variable *T = makeReg(Ty);
1905 _movp(T, Src0);
1906 _por(T, Src1);
1907 _movp(Dest, T);
1908 } break;
1909 case InstArithmetic::Xor: {
1910 Variable *T = makeReg(Ty);
1911 _movp(T, Src0);
1912 _pxor(T, Src1);
1913 _movp(Dest, T);
1914 } break;
1915 case InstArithmetic::Sub: {
1916 Variable *T = makeReg(Ty);
1917 _movp(T, Src0);
1918 _psub(T, Src1);
1919 _movp(Dest, T);
1920 } break;
1921 case InstArithmetic::Mul: {
1922 bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16;
1923 bool InstructionSetIsValidForPmull =
1924 Ty == IceType_v8i16 || InstructionSet >= SSE4_1;
1925 if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
1926 Variable *T = makeReg(Ty);
1927 _movp(T, Src0);
1928 _pmull(T, Src0 == Src1 ? T : Src1);
1929 _movp(Dest, T);
1930 } else if (Ty == IceType_v4i32) {
1931 // Lowering sequence:
1932 // Note: The mask arguments have index 0 on the left.
1933 //
1934 // movups T1, Src0
1935 // pshufd T2, Src0, {1,0,3,0}
1936 // pshufd T3, Src1, {1,0,3,0}
1937 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
1938 // pmuludq T1, Src1
1939 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
1940 // pmuludq T2, T3
1941 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
1942 // shufps T1, T2, {0,2,0,2}
1943 // pshufd T4, T1, {0,2,1,3}
1944 // movups Dest, T4
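      // This route is needed because SSE2 has no packed 32x32->32 multiply
      // (pmulld is SSE4.1 and is handled above); pmuludq only multiplies the
      // even-numbered lanes into 64-bit products, so two pmuludq plus the
      // pshufd/shufps shuffles are used to cover all four lanes.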
1945
1946 // Mask that directs pshufd to create a vector with entries
1947 // Src[1, 0, 3, 0]
1948 constexpr unsigned Constant1030 = 0x31;
1949 Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
1950 // Mask that directs shufps to create a vector with entries
1951 // Dest[0, 2], Src[0, 2]
1952 constexpr unsigned Mask0202 = 0x88;
1953 // Mask that directs pshufd to create a vector with entries
1954 // Src[0, 2, 1, 3]
1955 constexpr unsigned Mask0213 = 0xd8;
1956 Variable *T1 = makeReg(IceType_v4i32);
1957 Variable *T2 = makeReg(IceType_v4i32);
1958 Variable *T3 = makeReg(IceType_v4i32);
1959 Variable *T4 = makeReg(IceType_v4i32);
1960 _movp(T1, Src0);
1961 _pshufd(T2, Src0, Mask1030);
1962 _pshufd(T3, Src1, Mask1030);
1963 _pmuludq(T1, Src1);
1964 _pmuludq(T2, T3);
1965 _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
1966 _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
1967 _movp(Dest, T4);
1968 } else if (Ty == IceType_v16i8) {
1969 llvm::report_fatal_error("Scalarized operation was expected");
1970 } else {
1971 llvm::report_fatal_error("Invalid vector multiply type");
1972 }
1973 } break;
1974 case InstArithmetic::Shl: {
1975 assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
1976 Variable *T = makeReg(Ty);
1977 _movp(T, Src0);
1978 _psll(T, Src1);
1979 _movp(Dest, T);
1980 } break;
1981 case InstArithmetic::Lshr: {
1982 assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
1983 Variable *T = makeReg(Ty);
1984 _movp(T, Src0);
1985 _psrl(T, Src1);
1986 _movp(Dest, T);
1987 } break;
1988 case InstArithmetic::Ashr: {
1989 assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
1990 Variable *T = makeReg(Ty);
1991 _movp(T, Src0);
1992 _psra(T, Src1);
1993 _movp(Dest, T);
1994 } break;
1995 case InstArithmetic::Udiv:
1996 case InstArithmetic::Urem:
1997 case InstArithmetic::Sdiv:
1998 case InstArithmetic::Srem:
1999 llvm::report_fatal_error("Scalarized operation was expected");
2000 break;
2001 case InstArithmetic::Fadd: {
2002 Variable *T = makeReg(Ty);
2003 _movp(T, Src0);
2004 _addps(T, Src1);
2005 _movp(Dest, T);
2006 } break;
2007 case InstArithmetic::Fsub: {
2008 Variable *T = makeReg(Ty);
2009 _movp(T, Src0);
2010 _subps(T, Src1);
2011 _movp(Dest, T);
2012 } break;
2013 case InstArithmetic::Fmul: {
2014 Variable *T = makeReg(Ty);
2015 _movp(T, Src0);
2016 _mulps(T, Src0 == Src1 ? T : Src1);
2017 _movp(Dest, T);
2018 } break;
2019 case InstArithmetic::Fdiv: {
2020 Variable *T = makeReg(Ty);
2021 _movp(T, Src0);
2022 _divps(T, Src1);
2023 _movp(Dest, T);
2024 } break;
2025 case InstArithmetic::Frem:
2026 llvm::report_fatal_error("Scalarized operation was expected");
2027 break;
2028 }
2029 return;
2030 }
2031 Variable *T_edx = nullptr;
2032 Variable *T = nullptr;
2033 switch (Instr->getOp()) {
2034 case InstArithmetic::_num:
2035 llvm_unreachable("Unknown arithmetic operator");
2036 break;
2037 case InstArithmetic::Add: {
2038 const bool ValidType = Ty == IceType_i32;
2039 auto *Const = llvm::dyn_cast<Constant>(Instr->getSrc(1));
2040 const bool ValidKind =
2041 Const != nullptr && (llvm::isa<ConstantInteger32>(Const) ||
2042 llvm::isa<ConstantRelocatable>(Const));
2043 if (getFlags().getAggressiveLea() && ValidType && ValidKind) {
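      // lea folds the add into one three-operand instruction: the sum lands
      // directly in a fresh register without first copying Src0, and lea does
      // not modify the flags.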
2044 auto *Var = legalizeToReg(Src0);
2045 auto *Mem = X86OperandMem::create(Func, IceType_void, Var, Const);
2046 T = makeReg(Ty);
2047 _lea(T, Mem);
2048 _mov(Dest, T);
2049 break;
2050 }
2051 _mov(T, Src0);
2052 _add(T, Src1);
2053 _mov(Dest, T);
2054 } break;
2055 case InstArithmetic::And:
2056 _mov(T, Src0);
2057 _and(T, Src1);
2058 _mov(Dest, T);
2059 break;
2060 case InstArithmetic::Or:
2061 _mov(T, Src0);
2062 _or(T, Src1);
2063 _mov(Dest, T);
2064 break;
2065 case InstArithmetic::Xor:
2066 _mov(T, Src0);
2067 _xor(T, Src1);
2068 _mov(Dest, T);
2069 break;
2070 case InstArithmetic::Sub:
2071 _mov(T, Src0);
2072 _sub(T, Src1);
2073 _mov(Dest, T);
2074 break;
2075 case InstArithmetic::Mul:
2076 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2077 if (optimizeScalarMul(Dest, Src0, C->getValue()))
2078 return;
2079 }
2080 // The 8-bit version of imul only allows the form "imul r/m8" where T must
2081 // be in al.
2082 if (isByteSizedArithType(Ty)) {
2083 _mov(T, Src0, RegX8632::Reg_al);
2084 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2085 _imul(T, Src0 == Src1 ? T : Src1);
2086 _mov(Dest, T);
2087 } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2088 T = makeReg(Ty);
2089 Src0 = legalize(Src0, Legal_Reg | Legal_Mem);
2090 _imul_imm(T, Src0, ImmConst);
2091 _mov(Dest, T);
2092 } else {
2093 _mov(T, Src0);
2094 // No need to legalize Src1 to Reg | Mem because the Imm case is handled
2095 // already by the ConstantInteger32 case above.
2096 _imul(T, Src0 == Src1 ? T : Src1);
2097 _mov(Dest, T);
2098 }
2099 break;
2100 case InstArithmetic::Shl:
2101 _mov(T, Src0);
2102 if (!llvm::isa<ConstantInteger32>(Src1) &&
2103 !llvm::isa<ConstantInteger64>(Src1))
2104 Src1 = copyToReg8(Src1, RegX8632::Reg_cl);
2105 _shl(T, Src1);
2106 _mov(Dest, T);
2107 break;
2108 case InstArithmetic::Lshr:
2109 _mov(T, Src0);
2110 if (!llvm::isa<ConstantInteger32>(Src1) &&
2111 !llvm::isa<ConstantInteger64>(Src1))
2112 Src1 = copyToReg8(Src1, RegX8632::Reg_cl);
2113 _shr(T, Src1);
2114 _mov(Dest, T);
2115 break;
2116 case InstArithmetic::Ashr:
2117 _mov(T, Src0);
2118 if (!llvm::isa<ConstantInteger32>(Src1) &&
2119 !llvm::isa<ConstantInteger64>(Src1))
2120 Src1 = copyToReg8(Src1, RegX8632::Reg_cl);
2121 _sar(T, Src1);
2122 _mov(Dest, T);
2123 break;
2124 case InstArithmetic::Udiv: {
2125 // div and idiv are the few arithmetic operators that do not allow
2126 // immediates as the operand.
2127 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2128 RegNumT Eax;
2129 RegNumT Edx;
2130 switch (Ty) {
2131 default:
2132 llvm::report_fatal_error("Bad type for udiv");
2133 case IceType_i32:
2134 Eax = RegX8632::Reg_eax;
2135 Edx = RegX8632::Reg_edx;
2136 break;
2137 case IceType_i16:
2138 Eax = RegX8632::Reg_ax;
2139 Edx = RegX8632::Reg_dx;
2140 break;
2141 case IceType_i8:
2142 Eax = RegX8632::Reg_al;
2143 Edx = RegX8632::Reg_ah;
2144 break;
2145 }
2146 T_edx = makeReg(Ty, Edx);
2147 _mov(T, Src0, Eax);
2148 _mov(T_edx, Ctx->getConstantZero(Ty));
2149 _div(T_edx, Src1, T);
2150 _redefined(Context.insert<InstFakeDef>(T, T_edx));
2151 _mov(Dest, T);
2152 } break;
2153 case InstArithmetic::Sdiv:
2154 // TODO(stichnot): Enable this after doing better performance and cross
2155 // testing.
2156 if (false && Func->getOptLevel() >= Opt_1) {
2157 // Optimize division by constant power of 2, but not for Om1 or O0, just
2158 // to keep things simple there.
2159 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2160 const int32_t Divisor = C->getValue();
2161 const uint32_t UDivisor = Divisor;
2162 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
2163 uint32_t LogDiv = llvm::Log2_32(UDivisor);
2164 // LLVM does the following for dest=src/(1<<log):
2165 // t=src
2166 // sar t,typewidth-1 // -1 if src is negative, 0 if not
2167 // shr t,typewidth-log
2168 // add t,src
2169 // sar t,log
2170 // dest=t
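        // Worked example for i32 with src = -7 and log = 2:
        //   sar 31 -> -1; shr 30 -> 3 (the 2^log - 1 rounding bias);
        //   add src -> -4; sar 2 -> -1, matching C's truncating -7 / 4.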
2171 uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
2172 _mov(T, Src0);
2173 // If for some reason we are dividing by 1, just treat it like an
2174 // assignment.
2175 if (LogDiv > 0) {
2176 // The initial sar is unnecessary when dividing by 2.
2177 if (LogDiv > 1)
2178 _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
2179 _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
2180 _add(T, Src0);
2181 _sar(T, Ctx->getConstantInt(Ty, LogDiv));
2182 }
2183 _mov(Dest, T);
2184 return;
2185 }
2186 }
2187 }
2188 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2189 switch (Ty) {
2190 default:
2191 llvm::report_fatal_error("Bad type for sdiv");
2192 case IceType_i32:
2193 T_edx = makeReg(Ty, RegX8632::Reg_edx);
2194 _mov(T, Src0, RegX8632::Reg_eax);
2195 break;
2196 case IceType_i16:
2197 T_edx = makeReg(Ty, RegX8632::Reg_dx);
2198 _mov(T, Src0, RegX8632::Reg_ax);
2199 break;
2200 case IceType_i8:
2201 T_edx = makeReg(IceType_i16, RegX8632::Reg_ax);
2202 _mov(T, Src0, RegX8632::Reg_al);
2203 break;
2204 }
2205 _cbwdq(T_edx, T);
2206 _idiv(T_edx, Src1, T);
2207 _redefined(Context.insert<InstFakeDef>(T, T_edx));
2208 _mov(Dest, T);
2209 break;
2210 case InstArithmetic::Urem: {
2211 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2212 RegNumT Eax;
2213 RegNumT Edx;
2214 switch (Ty) {
2215 default:
2216 llvm::report_fatal_error("Bad type for urem");
2217 case IceType_i32:
2218 Eax = RegX8632::Reg_eax;
2219 Edx = RegX8632::Reg_edx;
2220 break;
2221 case IceType_i16:
2222 Eax = RegX8632::Reg_ax;
2223 Edx = RegX8632::Reg_dx;
2224 break;
2225 case IceType_i8:
2226 Eax = RegX8632::Reg_al;
2227 Edx = RegX8632::Reg_ah;
2228 break;
2229 }
2230 T_edx = makeReg(Ty, Edx);
2231 _mov(T_edx, Ctx->getConstantZero(Ty));
2232 _mov(T, Src0, Eax);
2233 _div(T, Src1, T_edx);
2234 _redefined(Context.insert<InstFakeDef>(T_edx, T));
2235 if (Ty == IceType_i8) {
2236 // Register ah must be moved into one of {al,bl,cl,dl} before it can be
2237 // moved into a general 8-bit register.
2238 auto *T_AhRcvr = makeReg(Ty);
2239 T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
2240 _mov(T_AhRcvr, T_edx);
2241 T_edx = T_AhRcvr;
2242 }
2243 _mov(Dest, T_edx);
2244 } break;
2245 case InstArithmetic::Srem: {
2246 // TODO(stichnot): Enable this after doing better performance and cross
2247 // testing.
2248 if (false && Func->getOptLevel() >= Opt_1) {
2249 // Optimize mod by constant power of 2, but not for Om1 or O0, just to
2250 // keep things simple there.
2251 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2252 const int32_t Divisor = C->getValue();
2253 const uint32_t UDivisor = Divisor;
2254 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
2255 uint32_t LogDiv = llvm::Log2_32(UDivisor);
2256 // LLVM does the following for dest=src%(1<<log):
2257 // t=src
2258 // sar t,typewidth-1 // -1 if src is negative, 0 if not
2259 // shr t,typewidth-log
2260 // add t,src
2261 // and t, -(1<<log)
2262 // sub t,src
2263 // neg t
2264 // dest=t
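        // Worked example for i32 with src = -7 and log = 2:
        //   sar/shr/add give t = -4; and -4 leaves t = -4 (the truncating
        //   quotient times 4); sub src gives 3; neg gives -3, matching -7 % 4.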
2265 uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
2266 // If for some reason we are dividing by 1, just assign 0.
2267 if (LogDiv == 0) {
2268 _mov(Dest, Ctx->getConstantZero(Ty));
2269 return;
2270 }
2271 _mov(T, Src0);
2272 // The initial sar is unnecessary when dividing by 2.
2273 if (LogDiv > 1)
2274 _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
2275 _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
2276 _add(T, Src0);
2277 _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
2278 _sub(T, Src0);
2279 _neg(T);
2280 _mov(Dest, T);
2281 return;
2282 }
2283 }
2284 }
2285 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2286 RegNumT Eax;
2287 RegNumT Edx;
2288 switch (Ty) {
2289 default:
2290 llvm::report_fatal_error("Bad type for srem");
2291 case IceType_i32:
2292 Eax = RegX8632::Reg_eax;
2293 Edx = RegX8632::Reg_edx;
2294 break;
2295 case IceType_i16:
2296 Eax = RegX8632::Reg_ax;
2297 Edx = RegX8632::Reg_dx;
2298 break;
2299 case IceType_i8:
2300 Eax = RegX8632::Reg_al;
2301 Edx = RegX8632::Reg_ah;
2302 break;
2303 }
2304 T_edx = makeReg(Ty, Edx);
2305 _mov(T, Src0, Eax);
2306 _cbwdq(T_edx, T);
2307 _idiv(T, Src1, T_edx);
2308 _redefined(Context.insert<InstFakeDef>(T_edx, T));
2309 if (Ty == IceType_i8) {
2310 // Register ah must be moved into one of {al,bl,cl,dl} before it can be
2311 // moved into a general 8-bit register.
2312 auto *T_AhRcvr = makeReg(Ty);
2313 T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
2314 _mov(T_AhRcvr, T_edx);
2315 T_edx = T_AhRcvr;
2316 }
2317 _mov(Dest, T_edx);
2318 } break;
2319 case InstArithmetic::Fadd:
2320 _mov(T, Src0);
2321 _addss(T, Src1);
2322 _mov(Dest, T);
2323 break;
2324 case InstArithmetic::Fsub:
2325 _mov(T, Src0);
2326 _subss(T, Src1);
2327 _mov(Dest, T);
2328 break;
2329 case InstArithmetic::Fmul:
2330 _mov(T, Src0);
2331 _mulss(T, Src0 == Src1 ? T : Src1);
2332 _mov(Dest, T);
2333 break;
2334 case InstArithmetic::Fdiv:
2335 _mov(T, Src0);
2336 _divss(T, Src1);
2337 _mov(Dest, T);
2338 break;
2339 case InstArithmetic::Frem:
2340 llvm::report_fatal_error("Helper call was expected");
2341 break;
2342 }
2343 }
2344
2345 void TargetX8632::lowerAssign(const InstAssign *Instr) {
2346 Variable *Dest = Instr->getDest();
2347 if (Dest->isRematerializable()) {
2348 Context.insert<InstFakeDef>(Dest);
2349 return;
2350 }
2351 Operand *Src = Instr->getSrc(0);
2352 assert(Dest->getType() == Src->getType());
2353 lowerMove(Dest, Src, false);
2354 }
2355
2356 void TargetX8632::lowerBr(const InstBr *Br) {
2357 if (Br->isUnconditional()) {
2358 _br(Br->getTargetUnconditional());
2359 return;
2360 }
2361 Operand *Cond = Br->getCondition();
2362
2363 // Handle folding opportunities.
2364 if (const Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
2365 assert(Producer->isDeleted());
2366 switch (BoolFolding::getProducerKind(Producer)) {
2367 default:
2368 break;
2369 case BoolFolding::PK_Icmp32:
2370 case BoolFolding::PK_Icmp64: {
2371 lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Br);
2372 return;
2373 }
2374 case BoolFolding::PK_Fcmp: {
2375 lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Br);
2376 return;
2377 }
2378 case BoolFolding::PK_Arith: {
2379 lowerArithAndConsumer(llvm::cast<InstArithmetic>(Producer), Br);
2380 return;
2381 }
2382 }
2383 }
2384 Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
2385 Constant *Zero = Ctx->getConstantZero(IceType_i32);
2386 _cmp(Src0, Zero);
2387 _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
2388 }
2389
2390 // constexprMax returns a (constexpr) max(S0, S1), and it is used for defining
2391 // OperandList in lowerCall. std::max() is not constexpr before C++14, so it
// can't be used here.
2392 inline constexpr SizeT constexprMax(SizeT S0, SizeT S1) {
2393 return S0 < S1 ? S1 : S0;
2394 }
2395
2396 void TargetX8632::lowerCall(const InstCall *Instr) {
2397 // System V x86-32 calling convention lowering:
2398 //
2399 // * At the point before the call, the stack must be aligned to 16 bytes.
2400 //
2401 // * Non-register arguments are pushed onto the stack in right-to-left order,
2402 // such that the left-most argument ends up on the top of the stack at the
2403 // lowest memory address.
2404 //
2405 // * Stack arguments of vector type are aligned to start at the next highest
2406 // multiple of 16 bytes. Other stack arguments are aligned to the next word
2407   //   size boundary (4 bytes on x86-32).
2408 //
2409 // This is compatible with the Microsoft x86-32 'cdecl' calling convention,
2410 // which doesn't have a 16-byte stack alignment requirement.
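  //
  // As an illustration (a hypothetical argument mix that all ends up in the
  // stack area): an i32 would go at [esp+0], a following <4 x float> argument
  // would be aligned up to [esp+16]..[esp+31], and an f64 would then land at
  // [esp+32], for a 40-byte parameter area that the final applyStackAlignment()
  // call below rounds up to 48.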
2411
2412 RequiredStackAlignment =
2413 std::max<size_t>(RequiredStackAlignment, X86_STACK_ALIGNMENT_BYTES);
2414
2415 constexpr SizeT MaxOperands =
2416 constexprMax(RegX8632::X86_MAX_XMM_ARGS, RegX8632::X86_MAX_GPR_ARGS);
2417 using OperandList = llvm::SmallVector<Operand *, MaxOperands>;
2418
2419 OperandList XmmArgs;
2420 llvm::SmallVector<SizeT, MaxOperands> XmmArgIndices;
2421 CfgVector<std::pair<const Type, Operand *>> GprArgs;
2422 CfgVector<SizeT> GprArgIndices;
2423 OperandList StackArgs, StackArgLocations;
2424 uint32_t ParameterAreaSizeBytes = 0;
2425
2426 // Classify each argument operand according to the location where the argument
2427 // is passed.
2428 for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
2429 Operand *Arg = Instr->getArg(i);
2430 const Type Ty = Arg->getType();
2431 // The PNaCl ABI requires the width of arguments to be at least 32 bits.
2432 assert(typeWidthInBytes(Ty) >= 4);
2433 if (isVectorType(Ty) && RegX8632::getRegisterForXmmArgNum(
2434 RegX8632::getArgIndex(i, XmmArgs.size()))
2435 .hasValue()) {
2436 XmmArgs.push_back(Arg);
2437 XmmArgIndices.push_back(i);
2438 } else if (isScalarIntegerType(Ty) &&
2439 RegX8632::getRegisterForGprArgNum(
2440 Ty, RegX8632::getArgIndex(i, GprArgs.size()))
2441 .hasValue()) {
2442 GprArgs.emplace_back(Ty, Arg);
2443 GprArgIndices.push_back(i);
2444 } else {
2445 // Place on stack.
2446 StackArgs.push_back(Arg);
2447 if (isVectorType(Arg->getType())) {
2448 ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
2449 }
2450 Variable *esp = getPhysicalRegister(getStackReg(), WordType);
2451 Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
2452 StackArgLocations.push_back(X86OperandMem::create(Func, Ty, esp, Loc));
2453 ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
2454 }
2455 }
2456   // Ensure there is enough space for the fstp/mov sequence used for
  // floating-point returns.
2457 Variable *Dest = Instr->getDest();
2458 const Type DestTy = Dest ? Dest->getType() : IceType_void;
2459 if (isScalarFloatingType(DestTy)) {
2460 ParameterAreaSizeBytes =
2461 std::max(static_cast<size_t>(ParameterAreaSizeBytes),
2462 typeWidthInBytesOnStack(DestTy));
2463 }
2464 // Adjust the parameter area so that the stack is aligned. It is assumed that
2465 // the stack is already aligned at the start of the calling sequence.
2466 ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
2467 assert(ParameterAreaSizeBytes <= maxOutArgsSizeBytes());
2468 // Copy arguments that are passed on the stack to the appropriate stack
2469 // locations. We make sure legalize() is called on each argument at this
2470 // point, to allow availabilityGet() to work.
2471 for (SizeT i = 0, NumStackArgs = StackArgs.size(); i < NumStackArgs; ++i) {
2472 lowerStore(
2473 InstStore::create(Func, legalize(StackArgs[i]), StackArgLocations[i]));
2474 }
2475 // Copy arguments to be passed in registers to the appropriate registers.
2476 for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
2477 XmmArgs[i] = legalizeToReg(legalize(XmmArgs[i]),
2478 RegX8632::getRegisterForXmmArgNum(
2479 RegX8632::getArgIndex(XmmArgIndices[i], i)));
2480 }
2481 // Materialize moves for arguments passed in GPRs.
2482 for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
2483 const Type SignatureTy = GprArgs[i].first;
2484 Operand *Arg =
2485 legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable);
2486 GprArgs[i].second = legalizeToReg(
2487 Arg, RegX8632::getRegisterForGprArgNum(
2488 Arg->getType(), RegX8632::getArgIndex(GprArgIndices[i], i)));
2489 assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32);
2490 assert(SignatureTy == Arg->getType());
2491 (void)SignatureTy;
2492 }
2493 // Generate a FakeUse of register arguments so that they do not get dead code
2494 // eliminated as a result of the FakeKill of scratch registers after the call.
2495 // These need to be right before the call instruction.
2496 for (auto *Arg : XmmArgs) {
2497 Context.insert<InstFakeUse>(llvm::cast<Variable>(Arg));
2498 }
2499 for (auto &ArgPair : GprArgs) {
2500 Context.insert<InstFakeUse>(llvm::cast<Variable>(ArgPair.second));
2501 }
2502 // Generate the call instruction. Assign its result to a temporary with high
2503 // register allocation weight.
2504 // ReturnReg doubles as ReturnRegLo as necessary.
2505 Variable *ReturnReg = nullptr;
2506 Variable *ReturnRegHi = nullptr;
2507 if (Dest) {
2508 switch (DestTy) {
2509 case IceType_NUM:
2510 case IceType_void:
2511 case IceType_i1:
2512 case IceType_i8:
2513 case IceType_i16:
2514 llvm::report_fatal_error("Invalid Call dest type");
2515 break;
2516 case IceType_i32:
2517 ReturnReg = makeReg(DestTy, RegX8632::Reg_eax);
2518 break;
2519 case IceType_i64:
2520 ReturnReg = makeReg(IceType_i32, RegX8632::Reg_eax);
2521 ReturnRegHi = makeReg(IceType_i32, RegX8632::Reg_edx);
2522 break;
2523 case IceType_f32:
2524 case IceType_f64:
2525 // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with
2526 // the fstp instruction.
2527       break;
2529 case IceType_v4i1:
2530 case IceType_v8i1:
2531 case IceType_v16i1:
2532 case IceType_v16i8:
2533 case IceType_v8i16:
2534 case IceType_v4i32:
2535 case IceType_v4f32:
2536 ReturnReg = makeReg(DestTy, RegX8632::Reg_xmm0);
2537 break;
2538 }
2539 }
2540 // Emit the call to the function.
2541 Operand *CallTarget =
2542 legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm | Legal_AddrAbs);
2543 size_t NumVariadicFpArgs = Instr->isVariadic() ? XmmArgs.size() : 0;
2544 Inst *NewCall = emitCallToTarget(CallTarget, ReturnReg, NumVariadicFpArgs);
2545 // Keep the upper return register live on 32-bit platform.
2546 if (ReturnRegHi)
2547 Context.insert<InstFakeDef>(ReturnRegHi);
2548 // Mark the call as killing all the caller-save registers.
2549 Context.insert<InstFakeKill>(NewCall);
2550 // Handle x86-32 floating point returns.
2551 if (Dest != nullptr && isScalarFloatingType(DestTy)) {
2552 // Special treatment for an FP function which returns its result in st(0).
2553 // If Dest ends up being a physical xmm register, the fstp emit code will
2554 // route st(0) through the space reserved in the function argument area
2555 // we allocated.
2556 _fstp(Dest);
2557 // Create a fake use of Dest in case it actually isn't used, because st(0)
2558 // still needs to be popped.
2559 Context.insert<InstFakeUse>(Dest);
2560 }
2561 // Generate a FakeUse to keep the call live if necessary.
2562 if (Instr->hasSideEffects() && ReturnReg) {
2563 Context.insert<InstFakeUse>(ReturnReg);
2564 }
2565 // Process the return value, if any.
2566 if (Dest == nullptr)
2567 return;
2568 // Assign the result of the call to Dest. Route it through a temporary so
2569 // that the local register availability peephole can be subsequently used.
2570 Variable *Tmp = nullptr;
2571 if (isVectorType(DestTy)) {
2572 assert(ReturnReg && "Vector type requires a return register");
2573 Tmp = makeReg(DestTy);
2574 _movp(Tmp, ReturnReg);
2575 _movp(Dest, Tmp);
2576 } else if (!isScalarFloatingType(DestTy)) {
2577 assert(isScalarIntegerType(DestTy));
2578 assert(ReturnReg && "Integer type requires a return register");
2579 if (DestTy == IceType_i64) {
2580 assert(ReturnRegHi && "64-bit type requires two return registers");
2581 auto *Dest64On32 = llvm::cast<Variable64On32>(Dest);
2582 Variable *DestLo = Dest64On32->getLo();
2583 Variable *DestHi = Dest64On32->getHi();
2584 _mov(Tmp, ReturnReg);
2585 _mov(DestLo, Tmp);
2586 Variable *TmpHi = nullptr;
2587 _mov(TmpHi, ReturnRegHi);
2588 _mov(DestHi, TmpHi);
2589 } else {
2590 _mov(Tmp, ReturnReg);
2591 _mov(Dest, Tmp);
2592 }
2593 }
2594 }
2595
2596 void TargetX8632::lowerCast(const InstCast *Instr) {
2597 // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
2598 InstCast::OpKind CastKind = Instr->getCastKind();
2599 Variable *Dest = Instr->getDest();
2600 Type DestTy = Dest->getType();
2601 switch (CastKind) {
2602 default:
2603 Func->setError("Cast type not supported");
2604 return;
2605 case InstCast::Sext: {
2606 // Src0RM is the source operand legalized to physical register or memory,
2607 // but not immediate, since the relevant x86 native instructions don't
2608 // allow an immediate operand. If the operand is an immediate, we could
2609 // consider computing the strength-reduced result at translation time, but
2610 // we're unlikely to see something like that in the bitcode that the
2611 // optimizer wouldn't have already taken care of.
2612 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2613 if (isVectorType(DestTy)) {
2614 if (DestTy == IceType_v16i8) {
2615 // onemask = materialize(1,1,...); dst = (src & onemask) > 0
2616 Variable *OneMask = makeVectorOfOnes(DestTy);
2617 Variable *T = makeReg(DestTy);
2618 _movp(T, Src0RM);
2619 _pand(T, OneMask);
2620 Variable *Zeros = makeVectorOfZeros(DestTy);
2621 _pcmpgt(T, Zeros);
2622 _movp(Dest, T);
2623 } else {
2624 /// width = width(elty) - 1; dest = (src << width) >> width
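        /// e.g. for v8i16 the shift amount is 15, so an element whose low bit
        /// is 1 becomes 0x8000 after the shl and 0xffff after the arithmetic
        /// shift right, while an element whose low bit is 0 becomes 0.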
2625 SizeT ShiftAmount =
2626 X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) - 1;
2627 Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
2628 Variable *T = makeReg(DestTy);
2629 _movp(T, Src0RM);
2630 _psll(T, ShiftConstant);
2631 _psra(T, ShiftConstant);
2632 _movp(Dest, T);
2633 }
2634 } else if (DestTy == IceType_i64) {
2635 // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
2636 Constant *Shift = Ctx->getConstantInt32(31);
2637 auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
2638 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2639 Variable *T_Lo = makeReg(DestLo->getType());
2640 if (Src0RM->getType() == IceType_i32) {
2641 _mov(T_Lo, Src0RM);
2642 } else if (Src0RM->getType() == IceType_i1) {
2643 _movzx(T_Lo, Src0RM);
2644 _shl(T_Lo, Shift);
2645 _sar(T_Lo, Shift);
2646 } else {
2647 _movsx(T_Lo, Src0RM);
2648 }
2649 _mov(DestLo, T_Lo);
2650 Variable *T_Hi = nullptr;
2651 _mov(T_Hi, T_Lo);
2652 if (Src0RM->getType() != IceType_i1)
2653 // For i1, the sar instruction is already done above.
2654 _sar(T_Hi, Shift);
2655 _mov(DestHi, T_Hi);
2656 } else if (Src0RM->getType() == IceType_i1) {
2657 // t1 = src
2658 // shl t1, dst_bitwidth - 1
2659 // sar t1, dst_bitwidth - 1
2660 // dst = t1
2661 size_t DestBits = X86_CHAR_BIT * typeWidthInBytes(DestTy);
2662 Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
2663 Variable *T = makeReg(DestTy);
2664 if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) {
2665 _mov(T, Src0RM);
2666 } else {
2667 // Widen the source using movsx or movzx. (It doesn't matter which one,
2668 // since the following shl/sar overwrite the bits.)
2669 _movzx(T, Src0RM);
2670 }
2671 _shl(T, ShiftAmount);
2672 _sar(T, ShiftAmount);
2673 _mov(Dest, T);
2674 } else {
2675 // t1 = movsx src; dst = t1
2676 Variable *T = makeReg(DestTy);
2677 _movsx(T, Src0RM);
2678 _mov(Dest, T);
2679 }
2680 break;
2681 }
2682 case InstCast::Zext: {
2683 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2684 if (isVectorType(DestTy)) {
2685 // onemask = materialize(1,1,...); dest = onemask & src
2686 Variable *OneMask = makeVectorOfOnes(DestTy);
2687 Variable *T = makeReg(DestTy);
2688 _movp(T, Src0RM);
2689 _pand(T, OneMask);
2690 _movp(Dest, T);
2691 } else if (DestTy == IceType_i64) {
2692 // t1=movzx src; dst.lo=t1; dst.hi=0
2693 Constant *Zero = Ctx->getConstantZero(IceType_i32);
2694 auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
2695 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2696 Variable *Tmp = makeReg(DestLo->getType());
2697 if (Src0RM->getType() == IceType_i32) {
2698 _mov(Tmp, Src0RM);
2699 } else {
2700 _movzx(Tmp, Src0RM);
2701 }
2702 _mov(DestLo, Tmp);
2703 _mov(DestHi, Zero);
2704 } else if (Src0RM->getType() == IceType_i1) {
2705 // t = Src0RM; Dest = t
2706 Variable *T = nullptr;
2707 if (DestTy == IceType_i8) {
2708 _mov(T, Src0RM);
2709 } else {
2710 assert(DestTy != IceType_i1);
2711 assert(DestTy != IceType_i64);
2712 // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
2713         // On x86-64 we would need to widen T to 64 bits to ensure that T -- if
2714         // written to the stack (i.e., in -Om1) -- is fully zero-extended.
2715 T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32);
2716 _movzx(T, Src0RM);
2717 }
2718 _mov(Dest, T);
2719 } else {
2720 // t1 = movzx src; dst = t1
2721 Variable *T = makeReg(DestTy);
2722 _movzx(T, Src0RM);
2723 _mov(Dest, T);
2724 }
2725 break;
2726 }
2727 case InstCast::Trunc: {
2728 if (isVectorType(DestTy)) {
2729 // onemask = materialize(1,1,...); dst = src & onemask
2730 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2731 Type Src0Ty = Src0RM->getType();
2732 Variable *OneMask = makeVectorOfOnes(Src0Ty);
2733 Variable *T = makeReg(DestTy);
2734 _movp(T, Src0RM);
2735 _pand(T, OneMask);
2736 _movp(Dest, T);
2737 } else if (DestTy == IceType_i1 || DestTy == IceType_i8) {
2738 // Make sure we truncate from and into valid registers.
2739 Operand *Src0 = legalizeUndef(Instr->getSrc(0));
2740 if (Src0->getType() == IceType_i64)
2741 Src0 = loOperand(Src0);
2742 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2743 Variable *T = copyToReg8(Src0RM);
2744 if (DestTy == IceType_i1)
2745 _and(T, Ctx->getConstantInt1(1));
2746 _mov(Dest, T);
2747 } else {
2748 Operand *Src0 = legalizeUndef(Instr->getSrc(0));
2749 if (Src0->getType() == IceType_i64)
2750 Src0 = loOperand(Src0);
2751 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2752 // t1 = trunc Src0RM; Dest = t1
2753 Variable *T = makeReg(DestTy);
2754 _mov(T, Src0RM);
2755 _mov(Dest, T);
2756 }
2757 break;
2758 }
2759 case InstCast::Fptrunc:
2760 case InstCast::Fpext: {
2761 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2762 // t1 = cvt Src0RM; Dest = t1
2763 Variable *T = makeReg(DestTy);
2764 _cvt(T, Src0RM, Insts::Cvt::Float2float);
2765 _mov(Dest, T);
2766 break;
2767 }
2768 case InstCast::Fptosi:
2769 if (isVectorType(DestTy)) {
2770 assert(DestTy == IceType_v4i32);
2771 assert(Instr->getSrc(0)->getType() == IceType_v4f32);
2772 Operand *Src0R = legalizeToReg(Instr->getSrc(0));
2773 Variable *T = makeReg(DestTy);
2774 _cvt(T, Src0R, Insts::Cvt::Tps2dq);
2775 _movp(Dest, T);
2776 } else if (DestTy == IceType_i64) {
2777 llvm::report_fatal_error("Helper call was expected");
2778 } else {
2779 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2780 // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
2781 Variable *T_1 = nullptr;
2782 assert(DestTy != IceType_i64);
2783 T_1 = makeReg(IceType_i32);
2784 // cvt() requires its integer argument to be a GPR.
2785 Variable *T_2 = makeReg(DestTy);
2786 if (isByteSizedType(DestTy)) {
2787 assert(T_1->getType() == IceType_i32);
2788 T_1->setRegClass(RCX86_Is32To8);
2789 T_2->setRegClass(RCX86_IsTrunc8Rcvr);
2790 }
2791 _cvt(T_1, Src0RM, Insts::Cvt::Tss2si);
2792 _mov(T_2, T_1); // T_1 and T_2 may have different integer types
2793 if (DestTy == IceType_i1)
2794 _and(T_2, Ctx->getConstantInt1(1));
2795 _mov(Dest, T_2);
2796 }
2797 break;
2798 case InstCast::Fptoui:
2799 if (isVectorType(DestTy)) {
2800 llvm::report_fatal_error("Helper call was expected");
2801 } else if (DestTy == IceType_i64 || DestTy == IceType_i32) {
2802 llvm::report_fatal_error("Helper call was expected");
2803 } else {
2804 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2805 // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
2806 assert(DestTy != IceType_i64);
2807 Variable *T_1 = nullptr;
2808 assert(DestTy != IceType_i32);
2809 T_1 = makeReg(IceType_i32);
2810 Variable *T_2 = makeReg(DestTy);
2811 if (isByteSizedType(DestTy)) {
2812 assert(T_1->getType() == IceType_i32);
2813 T_1->setRegClass(RCX86_Is32To8);
2814 T_2->setRegClass(RCX86_IsTrunc8Rcvr);
2815 }
2816 _cvt(T_1, Src0RM, Insts::Cvt::Tss2si);
2817 _mov(T_2, T_1); // T_1 and T_2 may have different integer types
2818 if (DestTy == IceType_i1)
2819 _and(T_2, Ctx->getConstantInt1(1));
2820 _mov(Dest, T_2);
2821 }
2822 break;
2823 case InstCast::Sitofp:
2824 if (isVectorType(DestTy)) {
2825 assert(DestTy == IceType_v4f32);
2826 assert(Instr->getSrc(0)->getType() == IceType_v4i32);
2827 Operand *Src0R = legalizeToReg(Instr->getSrc(0));
2828 Variable *T = makeReg(DestTy);
2829 _cvt(T, Src0R, Insts::Cvt::Dq2ps);
2830 _movp(Dest, T);
2831 } else if (Instr->getSrc(0)->getType() == IceType_i64) {
2832 llvm::report_fatal_error("Helper call was expected");
2833 } else {
2834 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2835 // Sign-extend the operand.
2836 // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
2837 Variable *T_1 = nullptr;
2838 assert(Src0RM->getType() != IceType_i64);
2839 T_1 = makeReg(IceType_i32);
2840 Variable *T_2 = makeReg(DestTy);
2841 if (Src0RM->getType() == T_1->getType())
2842 _mov(T_1, Src0RM);
2843 else
2844 _movsx(T_1, Src0RM);
2845 _cvt(T_2, T_1, Insts::Cvt::Si2ss);
2846 _mov(Dest, T_2);
2847 }
2848 break;
2849 case InstCast::Uitofp: {
2850 Operand *Src0 = Instr->getSrc(0);
2851 if (isVectorType(Src0->getType())) {
2852 llvm::report_fatal_error("Helper call was expected");
2853 } else if (Src0->getType() == IceType_i64 ||
2854 Src0->getType() == IceType_i32) {
2855 llvm::report_fatal_error("Helper call was expected");
2856 } else {
2857 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2858 // Zero-extend the operand.
2859 // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
2860 Variable *T_1 = nullptr;
2861 assert(Src0RM->getType() != IceType_i64);
2862 assert(Src0RM->getType() != IceType_i32);
2863 T_1 = makeReg(IceType_i32);
2864 Variable *T_2 = makeReg(DestTy);
2865 if (Src0RM->getType() == T_1->getType())
2866 _mov(T_1, Src0RM);
2867 else
2868 _movzx(T_1, Src0RM)->setMustKeep();
2869 _cvt(T_2, T_1, Insts::Cvt::Si2ss);
2870 _mov(Dest, T_2);
2871 }
2872 break;
2873 }
2874 case InstCast::Bitcast: {
2875 Operand *Src0 = Instr->getSrc(0);
2876 if (DestTy == Src0->getType()) {
2877 auto *Assign = InstAssign::create(Func, Dest, Src0);
2878 lowerAssign(Assign);
2879 return;
2880 }
2881 switch (DestTy) {
2882 default:
2883 llvm_unreachable("Unexpected Bitcast dest type");
2884 case IceType_i8: {
2885 llvm::report_fatal_error("Helper call was expected");
2886 } break;
2887 case IceType_i16: {
2888 llvm::report_fatal_error("Helper call was expected");
2889 } break;
2890 case IceType_i32:
2891 case IceType_f32: {
2892 Variable *Src0R = legalizeToReg(Src0);
2893 Variable *T = makeReg(DestTy);
2894 _movd(T, Src0R);
2895 _mov(Dest, T);
2896 } break;
2897 case IceType_i64: {
2898 assert(Src0->getType() == IceType_f64);
2899 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2900 // a.i64 = bitcast b.f64 ==>
2901 // s.f64 = spill b.f64
2902 // t_lo.i32 = lo(s.f64)
2903 // a_lo.i32 = t_lo.i32
2904 // t_hi.i32 = hi(s.f64)
2905 // a_hi.i32 = t_hi.i32
2906 Operand *SpillLo, *SpillHi;
2907 if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
2908 Variable *Spill = Func->makeVariable(IceType_f64);
2909 Spill->setLinkedTo(Src0Var);
2910 Spill->setMustNotHaveReg();
2911 _movq(Spill, Src0RM);
2912 SpillLo = VariableSplit::create(Func, Spill, VariableSplit::Low);
2913 SpillHi = VariableSplit::create(Func, Spill, VariableSplit::High);
2914 } else {
2915 SpillLo = loOperand(Src0RM);
2916 SpillHi = hiOperand(Src0RM);
2917 }
2918
2919 auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
2920 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2921 Variable *T_Lo = makeReg(IceType_i32);
2922 Variable *T_Hi = makeReg(IceType_i32);
2923
2924 _mov(T_Lo, SpillLo);
2925 _mov(DestLo, T_Lo);
2926 _mov(T_Hi, SpillHi);
2927 _mov(DestHi, T_Hi);
2928 } break;
2929 case IceType_f64: {
2930 assert(Src0->getType() == IceType_i64);
2931 Src0 = legalize(Src0);
2932 if (llvm::isa<X86OperandMem>(Src0)) {
2933 Variable *T = makeReg(DestTy);
2934 _movq(T, Src0);
2935 _movq(Dest, T);
2936 break;
2937 }
2938 // a.f64 = bitcast b.i64 ==>
2939 // t_lo.i32 = b_lo.i32
2940 // FakeDef(s.f64)
2941 // lo(s.f64) = t_lo.i32
2942 // t_hi.i32 = b_hi.i32
2943 // hi(s.f64) = t_hi.i32
2944 // a.f64 = s.f64
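      // Going through a stack slot is needed because x86-32 has no single
      // instruction that moves a pair of 32-bit GPRs into an xmm register; the
      // two halves are stored to the spill slot and the f64 is then reloaded
      // with movq.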
2945 Variable *Spill = Func->makeVariable(IceType_f64);
2946 Spill->setLinkedTo(Dest);
2947 Spill->setMustNotHaveReg();
2948
2949 Variable *T_Lo = nullptr, *T_Hi = nullptr;
2950 auto *SpillLo = VariableSplit::create(Func, Spill, VariableSplit::Low);
2951 auto *SpillHi = VariableSplit::create(Func, Spill, VariableSplit::High);
2952 _mov(T_Lo, loOperand(Src0));
2953 // Technically, the Spill is defined after the _store happens, but
2954 // SpillLo is considered a "use" of Spill so define Spill before it is
2955 // used.
2956 Context.insert<InstFakeDef>(Spill);
2957 _store(T_Lo, SpillLo);
2958 _mov(T_Hi, hiOperand(Src0));
2959 _store(T_Hi, SpillHi);
2960 _movq(Dest, Spill);
2961 } break;
2962 case IceType_v8i1: {
2963 llvm::report_fatal_error("Helper call was expected");
2964 } break;
2965 case IceType_v16i1: {
2966 llvm::report_fatal_error("Helper call was expected");
2967 } break;
2968 case IceType_v8i16:
2969 case IceType_v16i8:
2970 case IceType_v4i32:
2971 case IceType_v4f32: {
2972 if (Src0->getType() == IceType_i32) {
2973 // Bitcast requires equal type sizes, which isn't strictly the case
2974 // between scalars and vectors, but to emulate v4i8 vectors one has to
2975 // use v16i8 vectors.
2976 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2977 Variable *T = makeReg(DestTy);
2978 _movd(T, Src0RM);
2979 _mov(Dest, T);
2980 } else {
2981 _movp(Dest, legalizeToReg(Src0));
2982 }
2983 } break;
2984 }
2985 break;
2986 }
2987 }
2988 }
2989
2990 void TargetX8632::lowerExtractElement(const InstExtractElement *Instr) {
2991 Operand *SourceVectNotLegalized = Instr->getSrc(0);
2992 auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(1));
2993 // Only constant indices are allowed in PNaCl IR.
2994 assert(ElementIndex);
2995
2996 unsigned Index = ElementIndex->getValue();
2997 Type Ty = SourceVectNotLegalized->getType();
2998 Type ElementTy = typeElementType(Ty);
2999 Type InVectorElementTy = InstX86Base::getInVectorElementType(Ty);
3000
3001 // TODO(wala): Determine the best lowering sequences for each type.
3002 bool CanUsePextr = Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
3003 (InstructionSet >= SSE4_1 && Ty != IceType_v4f32);
3004 Variable *ExtractedElementR =
3005 makeReg(CanUsePextr ? IceType_i32 : InVectorElementTy);
3006 if (CanUsePextr) {
3007 // Use pextrb, pextrw, or pextrd. The "b" and "w" versions clear the upper
3008 // bits of the destination register, so we represent this by always
3009 // extracting into an i32 register. The _mov into Dest below will do
3010 // truncation as necessary.
3011 Constant *Mask = Ctx->getConstantInt32(Index);
3012 Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized);
3013 _pextr(ExtractedElementR, SourceVectR, Mask);
3014 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
3015 // Use pshufd and movd/movss.
3016 Variable *T = nullptr;
3017 if (Index) {
3018 // The shuffle only needs to occur if the element to be extracted is not
3019 // at the lowest index.
3020 Constant *Mask = Ctx->getConstantInt32(Index);
3021 T = makeReg(Ty);
3022 _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
3023 } else {
3024 T = legalizeToReg(SourceVectNotLegalized);
3025 }
3026
3027 if (InVectorElementTy == IceType_i32) {
3028 _movd(ExtractedElementR, T);
3029 } else { // Ty == IceType_f32
3030 // TODO(wala): _movss is only used here because _mov does not allow a
3031 // vector source and a scalar destination. _mov should be able to be
3032 // used here.
3033 // _movss is a binary instruction, so the FakeDef is needed to keep the
3034 // live range analysis consistent.
3035 Context.insert<InstFakeDef>(ExtractedElementR);
3036 _movss(ExtractedElementR, T);
3037 }
3038 } else {
3039 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
3040 // Spill the value to a stack slot and do the extraction in memory.
3041 //
3042 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
3043 // for legalizing to mem is implemented.
3044 Variable *Slot = Func->makeVariable(Ty);
3045 Slot->setMustNotHaveReg();
3046 _movp(Slot, legalizeToReg(SourceVectNotLegalized));
3047
3048 // Compute the location of the element in memory.
3049 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
3050 X86OperandMem *Loc =
3051 getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
3052 _mov(ExtractedElementR, Loc);
3053 }
3054
3055 if (ElementTy == IceType_i1) {
3056 // Truncate extracted integers to i1s if necessary.
3057 Variable *T = makeReg(IceType_i1);
3058 InstCast *Cast =
3059 InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
3060 lowerCast(Cast);
3061 ExtractedElementR = T;
3062 }
3063
3064 // Copy the element to the destination.
3065 Variable *Dest = Instr->getDest();
3066 _mov(Dest, ExtractedElementR);
3067 }
3068
3069 void TargetX8632::lowerFcmp(const InstFcmp *Fcmp) {
3070 Variable *Dest = Fcmp->getDest();
3071
3072 if (isVectorType(Dest->getType())) {
3073 lowerFcmpVector(Fcmp);
3074 } else {
3075 constexpr Inst *Consumer = nullptr;
3076 lowerFcmpAndConsumer(Fcmp, Consumer);
3077 }
3078 }
3079
3080 void TargetX8632::lowerFcmpAndConsumer(const InstFcmp *Fcmp,
3081 const Inst *Consumer) {
3082 Operand *Src0 = Fcmp->getSrc(0);
3083 Operand *Src1 = Fcmp->getSrc(1);
3084 Variable *Dest = Fcmp->getDest();
3085
3086 if (Consumer != nullptr) {
3087 if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3088 if (lowerOptimizeFcmpSelect(Fcmp, Select))
3089 return;
3090 }
3091 }
3092
3093 if (isVectorType(Dest->getType())) {
3094 lowerFcmp(Fcmp);
3095 if (Consumer != nullptr)
3096 lowerSelectVector(llvm::cast<InstSelect>(Consumer));
3097 return;
3098 }
3099
3100 // Lowering a = fcmp cond, b, c
3101 // ucomiss b, c /* only if C1 != Br_None */
3102 // /* but swap b,c order if SwapOperands==true */
3103 // mov a, <default>
3104 // j<C1> label /* only if C1 != Br_None */
3105 // j<C2> label /* only if C2 != Br_None */
3106 // FakeUse(a) /* only if C1 != Br_None */
3107 // mov a, !<default> /* only if C1 != Br_None */
3108 // label: /* only if C1 != Br_None */
3109 //
3110 // setcc lowering when C1 != Br_None && C2 == Br_None:
3111 // ucomiss b, c /* but swap b,c order if SwapOperands==true */
3112 // setcc a, C1
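  //
  // ucomiss sets ZF/PF/CF from the comparison (PF=1 signals an unordered
  // result), which is why each fcmp condition is expressible with at most two
  // branch conditions (C1, C2) plus a default value, all taken from TableFcmp.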
3113 InstFcmp::FCond Condition = Fcmp->getCondition();
3114 assert(static_cast<size_t>(Condition) < TableFcmpSize);
3115 if (TableFcmp[Condition].SwapScalarOperands)
3116 std::swap(Src0, Src1);
3117 const bool HasC1 = (TableFcmp[Condition].C1 != CondX86::Br_None);
3118 const bool HasC2 = (TableFcmp[Condition].C2 != CondX86::Br_None);
3119 if (HasC1) {
3120 Src0 = legalize(Src0);
3121 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3122 Variable *T = nullptr;
3123 _mov(T, Src0);
3124 _ucomiss(T, Src1RM);
3125 if (!HasC2) {
3126 assert(TableFcmp[Condition].Default);
3127 setccOrConsumer(TableFcmp[Condition].C1, Dest, Consumer);
3128 return;
3129 }
3130 }
3131 int32_t IntDefault = TableFcmp[Condition].Default;
3132 if (Consumer == nullptr) {
3133 Constant *Default = Ctx->getConstantInt(Dest->getType(), IntDefault);
3134 _mov(Dest, Default);
3135 if (HasC1) {
3136 InstX86Label *Label = InstX86Label::create(Func, this);
3137 _br(TableFcmp[Condition].C1, Label);
3138 if (HasC2) {
3139 _br(TableFcmp[Condition].C2, Label);
3140 }
3141 Constant *NonDefault = Ctx->getConstantInt(Dest->getType(), !IntDefault);
3142 _redefined(_mov(Dest, NonDefault));
3143 Context.insert(Label);
3144 }
3145 return;
3146 }
3147 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3148 CfgNode *TrueSucc = Br->getTargetTrue();
3149 CfgNode *FalseSucc = Br->getTargetFalse();
3150 if (IntDefault != 0)
3151 std::swap(TrueSucc, FalseSucc);
3152 if (HasC1) {
3153 _br(TableFcmp[Condition].C1, FalseSucc);
3154 if (HasC2) {
3155 _br(TableFcmp[Condition].C2, FalseSucc);
3156 }
3157 _br(TrueSucc);
3158 return;
3159 }
3160 _br(FalseSucc);
3161 return;
3162 }
3163 if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3164 Operand *SrcT = Select->getTrueOperand();
3165 Operand *SrcF = Select->getFalseOperand();
3166 Variable *SelectDest = Select->getDest();
3167 if (IntDefault != 0)
3168 std::swap(SrcT, SrcF);
3169 lowerMove(SelectDest, SrcF, false);
3170 if (HasC1) {
3171 InstX86Label *Label = InstX86Label::create(Func, this);
3172 _br(TableFcmp[Condition].C1, Label);
3173 if (HasC2) {
3174 _br(TableFcmp[Condition].C2, Label);
3175 }
3176 static constexpr bool IsRedefinition = true;
3177 lowerMove(SelectDest, SrcT, IsRedefinition);
3178 Context.insert(Label);
3179 }
3180 return;
3181 }
3182 llvm::report_fatal_error("Unexpected consumer type");
3183 }
3184
3185 void TargetX8632::lowerFcmpVector(const InstFcmp *Fcmp) {
3186 Operand *Src0 = Fcmp->getSrc(0);
3187 Operand *Src1 = Fcmp->getSrc(1);
3188 Variable *Dest = Fcmp->getDest();
3189
3190 if (!isVectorType(Dest->getType()))
3191 llvm::report_fatal_error("Expected vector compare");
3192
3193 InstFcmp::FCond Condition = Fcmp->getCondition();
3194 assert(static_cast<size_t>(Condition) < TableFcmpSize);
3195
3196 if (TableFcmp[Condition].SwapVectorOperands)
3197 std::swap(Src0, Src1);
3198
3199 Variable *T = nullptr;
3200
3201 if (Condition == InstFcmp::True) {
3202 // makeVectorOfMinusOnes() requires an integer vector type.
3203 T = makeVectorOfMinusOnes(IceType_v4i32);
3204 } else if (Condition == InstFcmp::False) {
3205 T = makeVectorOfZeros(Dest->getType());
3206 } else {
3207 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3208 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3209 if (llvm::isa<X86OperandMem>(Src1RM))
3210 Src1RM = legalizeToReg(Src1RM);
3211
3212 switch (Condition) {
3213 default: {
3214 const CmppsCond Predicate = TableFcmp[Condition].Predicate;
3215 assert(Predicate != CondX86::Cmpps_Invalid);
3216 T = makeReg(Src0RM->getType());
3217 _movp(T, Src0RM);
3218 _cmpps(T, Src1RM, Predicate);
3219 } break;
3220 case InstFcmp::One: {
3221 // Check both unequal and ordered.
3222 T = makeReg(Src0RM->getType());
3223 Variable *T2 = makeReg(Src0RM->getType());
3224 _movp(T, Src0RM);
3225 _cmpps(T, Src1RM, CondX86::Cmpps_neq);
3226 _movp(T2, Src0RM);
3227 _cmpps(T2, Src1RM, CondX86::Cmpps_ord);
3228 _pand(T, T2);
3229 } break;
3230 case InstFcmp::Ueq: {
3231 // Check both equal or unordered.
3232 T = makeReg(Src0RM->getType());
3233 Variable *T2 = makeReg(Src0RM->getType());
3234 _movp(T, Src0RM);
3235 _cmpps(T, Src1RM, CondX86::Cmpps_eq);
3236 _movp(T2, Src0RM);
3237 _cmpps(T2, Src1RM, CondX86::Cmpps_unord);
3238 _por(T, T2);
3239 } break;
3240 }
3241 }
3242
3243 assert(T != nullptr);
3244 _movp(Dest, T);
3245 eliminateNextVectorSextInstruction(Dest);
3246 }
3247
3248 inline bool isZero(const Operand *Opnd) {
3249 if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Opnd))
3250 return C64->getValue() == 0;
3251 if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(Opnd))
3252 return C32->getValue() == 0;
3253 return false;
3254 }
3255
3256 void TargetX8632::lowerIcmpAndConsumer(const InstIcmp *Icmp,
3257 const Inst *Consumer) {
3258 Operand *Src0 = legalize(Icmp->getSrc(0));
3259 Operand *Src1 = legalize(Icmp->getSrc(1));
3260 Variable *Dest = Icmp->getDest();
3261
3262 if (isVectorType(Dest->getType())) {
3263 lowerIcmp(Icmp);
3264 if (Consumer != nullptr)
3265 lowerSelectVector(llvm::cast<InstSelect>(Consumer));
3266 return;
3267 }
3268
3269 if (Src0->getType() == IceType_i64) {
3270 lowerIcmp64(Icmp, Consumer);
3271 return;
3272 }
3273
3274 // cmp b, c
3275 if (isZero(Src1)) {
3276 switch (Icmp->getCondition()) {
3277 default:
3278 break;
3279 case InstIcmp::Uge:
3280 movOrConsumer(true, Dest, Consumer);
3281 return;
3282 case InstIcmp::Ult:
3283 movOrConsumer(false, Dest, Consumer);
3284 return;
3285 }
3286 }
3287 Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
3288 _cmp(Src0RM, Src1);
3289 setccOrConsumer(getIcmp32Mapping(Icmp->getCondition()), Dest, Consumer);
3290 }
3291
3292 void TargetX8632::lowerIcmpVector(const InstIcmp *Icmp) {
3293 Operand *Src0 = legalize(Icmp->getSrc(0));
3294 Operand *Src1 = legalize(Icmp->getSrc(1));
3295 Variable *Dest = Icmp->getDest();
3296
3297 if (!isVectorType(Dest->getType()))
3298 llvm::report_fatal_error("Expected a vector compare");
3299
3300 Type Ty = Src0->getType();
3301 // Promote i1 vectors to 128 bit integer vector types.
3302 if (typeElementType(Ty) == IceType_i1) {
3303 Type NewTy = IceType_NUM;
3304 switch (Ty) {
3305 default:
3306 llvm::report_fatal_error("unexpected type");
3307 break;
3308 case IceType_v4i1:
3309 NewTy = IceType_v4i32;
3310 break;
3311 case IceType_v8i1:
3312 NewTy = IceType_v8i16;
3313 break;
3314 case IceType_v16i1:
3315 NewTy = IceType_v16i8;
3316 break;
3317 }
3318 Variable *NewSrc0 = Func->makeVariable(NewTy);
3319 Variable *NewSrc1 = Func->makeVariable(NewTy);
3320 lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
3321 lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
3322 Src0 = NewSrc0;
3323 Src1 = NewSrc1;
3324 Ty = NewTy;
3325 }
3326
3327 InstIcmp::ICond Condition = Icmp->getCondition();
3328
3329 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3330 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3331
3332 // SSE2 only has signed comparison operations. Transform unsigned inputs in
3333 // a manner that allows for the use of signed comparison operations by
3334 // flipping the high order bits.
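  // For example (an illustrative sketch for a v4i32 "icmp ugt a, b"):
  //   pxor a, <0x80000000 x 4>   ; bias both operands into signed range
  //   pxor b, <0x80000000 x 4>
  //   pcmpgtd a, b               ; a signed compare now yields the unsigned result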
3335 if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
3336 Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
3337 Variable *T0 = makeReg(Ty);
3338 Variable *T1 = makeReg(Ty);
3339 Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
3340 _movp(T0, Src0RM);
3341 _pxor(T0, HighOrderBits);
3342 _movp(T1, Src1RM);
3343 _pxor(T1, HighOrderBits);
3344 Src0RM = T0;
3345 Src1RM = T1;
3346 }
3347
3348 Variable *T = makeReg(Ty);
3349 switch (Condition) {
3350 default:
3351 llvm_unreachable("unexpected condition");
3352 break;
3353 case InstIcmp::Eq: {
3354 if (llvm::isa<X86OperandMem>(Src1RM))
3355 Src1RM = legalizeToReg(Src1RM);
3356 _movp(T, Src0RM);
3357 _pcmpeq(T, Src1RM);
3358 } break;
3359 case InstIcmp::Ne: {
3360 if (llvm::isa<X86OperandMem>(Src1RM))
3361 Src1RM = legalizeToReg(Src1RM);
3362 _movp(T, Src0RM);
3363 _pcmpeq(T, Src1RM);
3364 Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3365 _pxor(T, MinusOne);
3366 } break;
3367 case InstIcmp::Ugt:
3368 case InstIcmp::Sgt: {
3369 if (llvm::isa<X86OperandMem>(Src1RM))
3370 Src1RM = legalizeToReg(Src1RM);
3371 _movp(T, Src0RM);
3372 _pcmpgt(T, Src1RM);
3373 } break;
3374 case InstIcmp::Uge:
3375 case InstIcmp::Sge: {
3376 // !(Src1RM > Src0RM)
3377 if (llvm::isa<X86OperandMem>(Src0RM))
3378 Src0RM = legalizeToReg(Src0RM);
3379 _movp(T, Src1RM);
3380 _pcmpgt(T, Src0RM);
3381 Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3382 _pxor(T, MinusOne);
3383 } break;
3384 case InstIcmp::Ult:
3385 case InstIcmp::Slt: {
3386 if (llvm::isa<X86OperandMem>(Src0RM))
3387 Src0RM = legalizeToReg(Src0RM);
3388 _movp(T, Src1RM);
3389 _pcmpgt(T, Src0RM);
3390 } break;
3391 case InstIcmp::Ule:
3392 case InstIcmp::Sle: {
3393 // !(Src0RM > Src1RM)
3394 if (llvm::isa<X86OperandMem>(Src1RM))
3395 Src1RM = legalizeToReg(Src1RM);
3396 _movp(T, Src0RM);
3397 _pcmpgt(T, Src1RM);
3398 Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3399 _pxor(T, MinusOne);
3400 } break;
3401 }
3402
3403 _movp(Dest, T);
3404 eliminateNextVectorSextInstruction(Dest);
3405 }
3406
3407 void TargetX8632::lowerIcmp64(const InstIcmp *Icmp, const Inst *Consumer) {
3408 // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
3409 Operand *Src0 = legalize(Icmp->getSrc(0));
3410 Operand *Src1 = legalize(Icmp->getSrc(1));
3411 Variable *Dest = Icmp->getDest();
3412 InstIcmp::ICond Condition = Icmp->getCondition();
3413 assert(static_cast<size_t>(Condition) < TableIcmp64Size);
3414 Operand *Src0LoRM = nullptr;
3415 Operand *Src0HiRM = nullptr;
3416 // Legalize the portions of Src0 that are going to be needed.
3417 if (isZero(Src1)) {
3418 switch (Condition) {
3419 default:
3420 llvm_unreachable("unexpected condition");
3421 break;
3422 // These two are not optimized, so we fall through to the general case,
3423 // which needs the upper and lower halves legalized.
3424 case InstIcmp::Sgt:
3425 case InstIcmp::Sle:
3426 // These four compare after performing an "or" of the high and low half, so
3427 // they need the upper and lower halves legalized.
3428 case InstIcmp::Eq:
3429 case InstIcmp::Ule:
3430 case InstIcmp::Ne:
3431 case InstIcmp::Ugt:
3432 Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
3433 // These two test only the high half's sign bit, so they need only
3434 // the upper half legalized.
3435 case InstIcmp::Sge:
3436 case InstIcmp::Slt:
3437 Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
3438 break;
3439
3440 // These two move constants and hence need no legalization.
3441 case InstIcmp::Uge:
3442 case InstIcmp::Ult:
3443 break;
3444 }
3445 } else {
3446 Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
3447 Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
3448 }
3449 // Optimize comparisons with zero.
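  // For example (a sketch): "icmp eq i64 %x, 0" only needs
  //   mov t, x.hi ; or t, x.lo ; sete dest
  // and "icmp slt i64 %x, 0" only needs to test the sign bit of the high half:
  //   test x.hi, 0x80000000 ; setne dest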
3450 if (isZero(Src1)) {
3451 Constant *SignMask = Ctx->getConstantInt32(0x80000000);
3452 Variable *Temp = nullptr;
3453 switch (Condition) {
3454 default:
3455 llvm_unreachable("unexpected condition");
3456 break;
3457 case InstIcmp::Eq:
3458 case InstIcmp::Ule:
3459 // Mov Src0HiRM first, because it was legalized most recently, and will
3460 // sometimes avoid a move before the OR.
3461 _mov(Temp, Src0HiRM);
3462 _or(Temp, Src0LoRM);
3463 Context.insert<InstFakeUse>(Temp);
3464 setccOrConsumer(CondX86::Br_e, Dest, Consumer);
3465 return;
3466 case InstIcmp::Ne:
3467 case InstIcmp::Ugt:
3468 // Mov Src0HiRM first, because it was legalized most recently, and will
3469 // sometimes avoid a move before the OR.
3470 _mov(Temp, Src0HiRM);
3471 _or(Temp, Src0LoRM);
3472 Context.insert<InstFakeUse>(Temp);
3473 setccOrConsumer(CondX86::Br_ne, Dest, Consumer);
3474 return;
3475 case InstIcmp::Uge:
3476 movOrConsumer(true, Dest, Consumer);
3477 return;
3478 case InstIcmp::Ult:
3479 movOrConsumer(false, Dest, Consumer);
3480 return;
3481 case InstIcmp::Sgt:
3482 break;
3483 case InstIcmp::Sge:
3484 _test(Src0HiRM, SignMask);
3485 setccOrConsumer(CondX86::Br_e, Dest, Consumer);
3486 return;
3487 case InstIcmp::Slt:
3488 _test(Src0HiRM, SignMask);
3489 setccOrConsumer(CondX86::Br_ne, Dest, Consumer);
3490 return;
3491 case InstIcmp::Sle:
3492 break;
3493 }
3494 }
3495 // Handle general compares.
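  // A sketch of the general pattern when there is no consumer: compare the
  // high halves first, and only compare the low halves when the high halves
  // do not decide the result (C1/C2 are emitted only when present in
  // TableIcmp64):
  //   mov dest, 1
  //   cmp b.hi, c.hi
  //   j<C1> LabelTrue
  //   j<C2> LabelFalse
  //   cmp b.lo, c.lo
  //   j<C3> LabelTrue
  // LabelFalse:
  //   mov dest, 0
  // LabelTrue: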
3496 Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
3497 Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
3498 if (Consumer == nullptr) {
3499 Constant *Zero = Ctx->getConstantInt(Dest->getType(), 0);
3500 Constant *One = Ctx->getConstantInt(Dest->getType(), 1);
3501 InstX86Label *LabelFalse = InstX86Label::create(Func, this);
3502 InstX86Label *LabelTrue = InstX86Label::create(Func, this);
3503 _mov(Dest, One);
3504 _cmp(Src0HiRM, Src1HiRI);
3505 if (TableIcmp64[Condition].C1 != CondX86::Br_None)
3506 _br(TableIcmp64[Condition].C1, LabelTrue);
3507 if (TableIcmp64[Condition].C2 != CondX86::Br_None)
3508 _br(TableIcmp64[Condition].C2, LabelFalse);
3509 _cmp(Src0LoRM, Src1LoRI);
3510 _br(TableIcmp64[Condition].C3, LabelTrue);
3511 Context.insert(LabelFalse);
3512 _redefined(_mov(Dest, Zero));
3513 Context.insert(LabelTrue);
3514 return;
3515 }
3516 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3517 _cmp(Src0HiRM, Src1HiRI);
3518 if (TableIcmp64[Condition].C1 != CondX86::Br_None)
3519 _br(TableIcmp64[Condition].C1, Br->getTargetTrue());
3520 if (TableIcmp64[Condition].C2 != CondX86::Br_None)
3521 _br(TableIcmp64[Condition].C2, Br->getTargetFalse());
3522 _cmp(Src0LoRM, Src1LoRI);
3523 _br(TableIcmp64[Condition].C3, Br->getTargetTrue(), Br->getTargetFalse());
3524 return;
3525 }
3526 if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3527 Operand *SrcT = Select->getTrueOperand();
3528 Operand *SrcF = Select->getFalseOperand();
3529 Variable *SelectDest = Select->getDest();
3530 InstX86Label *LabelFalse = InstX86Label::create(Func, this);
3531 InstX86Label *LabelTrue = InstX86Label::create(Func, this);
3532 lowerMove(SelectDest, SrcT, false);
3533 _cmp(Src0HiRM, Src1HiRI);
3534 if (TableIcmp64[Condition].C1 != CondX86::Br_None)
3535 _br(TableIcmp64[Condition].C1, LabelTrue);
3536 if (TableIcmp64[Condition].C2 != CondX86::Br_None)
3537 _br(TableIcmp64[Condition].C2, LabelFalse);
3538 _cmp(Src0LoRM, Src1LoRI);
3539 _br(TableIcmp64[Condition].C3, LabelTrue);
3540 Context.insert(LabelFalse);
3541 static constexpr bool IsRedefinition = true;
3542 lowerMove(SelectDest, SrcF, IsRedefinition);
3543 Context.insert(LabelTrue);
3544 return;
3545 }
3546 llvm::report_fatal_error("Unexpected consumer type");
3547 }
3548
3549 void TargetX8632::setccOrConsumer(BrCond Condition, Variable *Dest,
3550 const Inst *Consumer) {
3551 if (Consumer == nullptr) {
3552 _setcc(Dest, Condition);
3553 return;
3554 }
3555 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3556 _br(Condition, Br->getTargetTrue(), Br->getTargetFalse());
3557 return;
3558 }
3559 if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3560 Operand *SrcT = Select->getTrueOperand();
3561 Operand *SrcF = Select->getFalseOperand();
3562 Variable *SelectDest = Select->getDest();
3563 lowerSelectMove(SelectDest, Condition, SrcT, SrcF);
3564 return;
3565 }
3566 llvm::report_fatal_error("Unexpected consumer type");
3567 }
3568
3569 void TargetX8632::movOrConsumer(bool IcmpResult, Variable *Dest,
3570 const Inst *Consumer) {
3571 if (Consumer == nullptr) {
3572 _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
3573 return;
3574 }
3575 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3576 // TODO(sehr,stichnot): This could be done with a single unconditional
3577 // branch instruction, but subzero doesn't know how to handle the resulting
3578 // control flow graph changes now. Make it do so to eliminate mov and cmp.
3579 _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
3580 _cmp(Dest, Ctx->getConstantInt(Dest->getType(), 0));
3581 _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
3582 return;
3583 }
3584 if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3585 Operand *Src = nullptr;
3586 if (IcmpResult) {
3587 Src = legalize(Select->getTrueOperand(), Legal_Reg | Legal_Imm);
3588 } else {
3589 Src = legalize(Select->getFalseOperand(), Legal_Reg | Legal_Imm);
3590 }
3591 Variable *SelectDest = Select->getDest();
3592 lowerMove(SelectDest, Src, false);
3593 return;
3594 }
3595 llvm::report_fatal_error("Unexpected consumer type");
3596 }
3597
3598 void TargetX8632::lowerArithAndConsumer(const InstArithmetic *Arith,
3599 const Inst *Consumer) {
3600 Variable *T = nullptr;
3601 Operand *Src0 = legalize(Arith->getSrc(0));
3602 Operand *Src1 = legalize(Arith->getSrc(1));
3603 Variable *Dest = Arith->getDest();
3604 switch (Arith->getOp()) {
3605 default:
3606 llvm_unreachable("arithmetic operator not AND or OR");
3607 break;
3608 case InstArithmetic::And:
3609 _mov(T, Src0);
3610 // Test cannot have an address in the second position. Since T is
3611 // guaranteed to be a register and Src1 could be a memory load, ensure
3612 // that the second argument is a register.
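    // E.g. (a sketch): "%t = and i32 %a, %b" consumed by "br i1 %t" becomes
    //   mov  t, a
    //   test b, t        ; sets ZF from a & b without keeping the result
    // followed by the conditional branch emitted for the consumer below,
    // instead of an "and" plus a separate compare.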
3613 if (llvm::isa<Constant>(Src1))
3614 _test(T, Src1);
3615 else
3616 _test(Src1, T);
3617 break;
3618 case InstArithmetic::Or:
3619 _mov(T, Src0);
3620 _or(T, Src1);
3621 break;
3622 }
3623
3624 if (Consumer == nullptr) {
3625 llvm::report_fatal_error("Expected a consumer instruction");
3626 }
3627 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3628 Context.insert<InstFakeUse>(T);
3629 Context.insert<InstFakeDef>(Dest);
3630 _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
3631 return;
3632 }
3633 llvm::report_fatal_error("Unexpected consumer type");
3634 }
3635
3636 void TargetX8632::lowerInsertElement(const InstInsertElement *Instr) {
3637 Operand *SourceVectNotLegalized = Instr->getSrc(0);
3638 Operand *ElementToInsertNotLegalized = Instr->getSrc(1);
3639 auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(2));
3640 // Only constant indices are allowed in PNaCl IR.
3641 assert(ElementIndex);
3642 unsigned Index = ElementIndex->getValue();
3643 assert(Index < typeNumElements(SourceVectNotLegalized->getType()));
3644
3645 Type Ty = SourceVectNotLegalized->getType();
3646 Type ElementTy = typeElementType(Ty);
3647 Type InVectorElementTy = InstX86Base::getInVectorElementType(Ty);
3648
3649 if (ElementTy == IceType_i1) {
3650 // Expand the element to the appropriate size for it to be inserted in the
3651 // vector.
3652 Variable *Expanded = Func->makeVariable(InVectorElementTy);
3653 auto *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
3654 ElementToInsertNotLegalized);
3655 lowerCast(Cast);
3656 ElementToInsertNotLegalized = Expanded;
3657 }
3658
3659 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) {
3660 // Use insertps, pinsrb, pinsrw, or pinsrd.
3661 Operand *ElementRM =
3662 legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
3663 Operand *SourceVectRM =
3664 legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
3665 Variable *T = makeReg(Ty);
3666 _movp(T, SourceVectRM);
3667 if (Ty == IceType_v4f32) {
3668 _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
3669 } else {
3670 // For the pinsrb and pinsrw instructions, when the source operand is a
3671 // register, it must be a full r32 register like eax, and not ax/al/ah.
3672 // For filetype=asm, InstX86Pinsr::emit() compensates for the use of
3673 // r16 and r8 by converting them through getBaseReg(), while emitIAS()
3674 // validates that the original and base register encodings are the
3675 // same.
3676 if (ElementRM->getType() == IceType_i8 &&
3677 llvm::isa<Variable>(ElementRM)) {
3678 // Don't use ah/bh/ch/dh for pinsrb.
3679 ElementRM = copyToReg8(ElementRM);
3680 }
3681 _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
3682 }
3683 _movp(Instr->getDest(), T);
3684 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
3685 // Use shufps or movss.
3686 Variable *ElementR = nullptr;
3687 Operand *SourceVectRM =
3688 legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
3689
3690 if (InVectorElementTy == IceType_f32) {
3691 // ElementR will be in an XMM register since it is floating point.
3692 ElementR = legalizeToReg(ElementToInsertNotLegalized);
3693 } else {
3694 // Copy an integer to an XMM register.
3695 Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
3696 ElementR = makeReg(Ty);
3697 _movd(ElementR, T);
3698 }
3699
3700 if (Index == 0) {
3701 Variable *T = makeReg(Ty);
3702 _movp(T, SourceVectRM);
3703 _movss(T, ElementR);
3704 _movp(Instr->getDest(), T);
3705 return;
3706 }
3707
3708 // shufps treats the source and destination operands as vectors of four
3709 // doublewords. The destination's two high doublewords are selected from
3710 // the source operand and the two low doublewords are selected from the
3711 // (original value of) the destination operand. An insertelement operation
3712 // can be effected with a sequence of two shufps operations with
3713 // appropriate masks. In all cases below, Element[0] is being inserted into
3714 // SourceVectOperand. Indices are ordered from left to right.
3715 //
3716 // insertelement into index 1 (result is stored in ElementR):
3717 // ElementR := ElementR[0, 0] SourceVectRM[0, 0]
3718 // ElementR := ElementR[3, 0] SourceVectRM[2, 3]
3719 //
3720 // insertelement into index 2 (result is stored in T):
3721 // T := SourceVectRM
3722 // ElementR := ElementR[0, 0] T[0, 3]
3723 // T := T[0, 1] ElementR[0, 3]
3724 //
3725 // insertelement into index 3 (result is stored in T):
3726 // T := SourceVectRM
3727 // ElementR := ElementR[0, 0] T[0, 2]
3728 // T := T[0, 1] ElementR[3, 0]
3729 const unsigned char Mask1[3] = {0, 192, 128};
3730 const unsigned char Mask2[3] = {227, 196, 52};
3731
3732 Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
3733 Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);
3734
3735 if (Index == 1) {
3736 _shufps(ElementR, SourceVectRM, Mask1Constant);
3737 _shufps(ElementR, SourceVectRM, Mask2Constant);
3738 _movp(Instr->getDest(), ElementR);
3739 } else {
3740 Variable *T = makeReg(Ty);
3741 _movp(T, SourceVectRM);
3742 _shufps(ElementR, T, Mask1Constant);
3743 _shufps(T, ElementR, Mask2Constant);
3744 _movp(Instr->getDest(), T);
3745 }
3746 } else {
3747 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
3748 // Spill the value to a stack slot and perform the insertion in memory.
3749 //
3750 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
3751 // for legalizing to mem is implemented.
3752 Variable *Slot = Func->makeVariable(Ty);
3753 Slot->setMustNotHaveReg();
3754 _movp(Slot, legalizeToReg(SourceVectNotLegalized));
3755
3756 // Compute the location of the position to insert in memory.
3757 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
3758 X86OperandMem *Loc =
3759 getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
3760 _store(legalizeToReg(ElementToInsertNotLegalized), Loc);
3761
3762 Variable *T = makeReg(Ty);
3763 _movp(T, Slot);
3764 _movp(Instr->getDest(), T);
3765 }
3766 }
3767
3768 void TargetX8632::lowerIntrinsic(const InstIntrinsic *Instr) {
3769 switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicID()) {
3770 case Intrinsics::AtomicCmpxchg: {
3771 if (!Intrinsics::isMemoryOrderValid(
3772 ID, getConstantMemoryOrder(Instr->getArg(3)),
3773 getConstantMemoryOrder(Instr->getArg(4)))) {
3774 Func->setError("Unexpected memory ordering for AtomicCmpxchg");
3775 return;
3776 }
3777 Variable *DestPrev = Instr->getDest();
3778 Operand *PtrToMem = legalize(Instr->getArg(0));
3779 Operand *Expected = legalize(Instr->getArg(1));
3780 Operand *Desired = legalize(Instr->getArg(2));
3781 if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
3782 return;
3783 lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
3784 return;
3785 }
3786 case Intrinsics::AtomicFence:
3787 if (!Intrinsics::isMemoryOrderValid(
3788 ID, getConstantMemoryOrder(Instr->getArg(0)))) {
3789 Func->setError("Unexpected memory ordering for AtomicFence");
3790 return;
3791 }
3792 _mfence();
3793 return;
3794 case Intrinsics::AtomicFenceAll:
3795 // NOTE: FenceAll should prevent any load/store from being moved across the
3796 // fence (both atomic and non-atomic). The InstX8632Mfence instruction is
3797 // currently marked coarsely as "HasSideEffects".
3798 _mfence();
3799 return;
3800 case Intrinsics::AtomicIsLockFree: {
3801 // X86 is always lock free for 8/16/32/64 bit accesses.
3802 // TODO(jvoung): Since the result is constant when given a constant byte
3803 // size, this opens up DCE opportunities.
3804 Operand *ByteSize = Instr->getArg(0);
3805 Variable *Dest = Instr->getDest();
3806 if (auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
3807 Constant *Result;
3808 switch (CI->getValue()) {
3809 default:
3810 // Some x86-64 processors support the cmpxchg16b instruction, which can
3811 // make 16-byte operations lock free (when used with the LOCK prefix).
3812 // However, that's not supported in 32-bit mode, so just return 0 even
3813 // for large sizes.
3814 Result = Ctx->getConstantZero(IceType_i32);
3815 break;
3816 case 1:
3817 case 2:
3818 case 4:
3819 case 8:
3820 Result = Ctx->getConstantInt32(1);
3821 break;
3822 }
3823 _mov(Dest, Result);
3824 return;
3825 }
3826 // The PNaCl ABI requires the byte size to be a compile-time constant.
3827 Func->setError("AtomicIsLockFree byte size should be compile-time const");
3828 return;
3829 }
3830 case Intrinsics::AtomicLoad: {
3831 // We require the memory address to be naturally aligned. Given that,
3832 // normal loads are atomic.
3833 if (!Intrinsics::isMemoryOrderValid(
3834 ID, getConstantMemoryOrder(Instr->getArg(1)))) {
3835 Func->setError("Unexpected memory ordering for AtomicLoad");
3836 return;
3837 }
3838 Variable *Dest = Instr->getDest();
3839 if (auto *Dest64On32 = llvm::dyn_cast<Variable64On32>(Dest)) {
3840 // Follow what GCC does and use a movq instead of what lowerLoad()
3841 // normally does (split the load into two). Thus, this skips
3842 // load/arithmetic op folding. Load/arithmetic folding can't happen
3843 // anyway, since this is x86-32 and integer arithmetic only happens on
3844 // 32-bit quantities.
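      // A sketch of the expected sequence (the i64 bitcast itself is handled
      // by lowerCast(), typically through a stack slot):
      //   movq xmm0, [addr]   ; one 8-byte load, atomic when naturally aligned
      //   <bitcast the xmm0 bits back out to dest.lo / dest.hi>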
3845 Variable *T = makeReg(IceType_f64);
3846 X86OperandMem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64);
3847 _movq(T, Addr);
3848 // Then cast the bits back out of the XMM register to the i64 Dest.
3849 auto *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
3850 lowerCast(Cast);
3851 // Make sure that the atomic load isn't elided when unused.
3852 Context.insert<InstFakeUse>(Dest64On32->getLo());
3853 Context.insert<InstFakeUse>(Dest64On32->getHi());
3854 return;
3855 }
3856 auto *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
3857 lowerLoad(Load);
3858 // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
3859 // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert
3860 // the FakeUse on the last-inserted instruction's dest.
3861 Context.insert<InstFakeUse>(Context.getLastInserted()->getDest());
3862 return;
3863 }
3864 case Intrinsics::AtomicRMW:
3865 if (!Intrinsics::isMemoryOrderValid(
3866 ID, getConstantMemoryOrder(Instr->getArg(3)))) {
3867 Func->setError("Unexpected memory ordering for AtomicRMW");
3868 return;
3869 }
3870 lowerAtomicRMW(
3871 Instr->getDest(),
3872 static_cast<uint32_t>(
3873 llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
3874 Instr->getArg(1), Instr->getArg(2));
3875 return;
3876 case Intrinsics::AtomicStore: {
3877 if (!Intrinsics::isMemoryOrderValid(
3878 ID, getConstantMemoryOrder(Instr->getArg(2)))) {
3879 Func->setError("Unexpected memory ordering for AtomicStore");
3880 return;
3881 }
3882 // We require the memory address to be naturally aligned. Given that,
3883 // normal stores are atomic. Add a fence after the store to make it
3884 // visible.
3885 Operand *Value = Instr->getArg(0);
3886 Operand *Ptr = Instr->getArg(1);
3887 if (Value->getType() == IceType_i64) {
3888 // Use a movq instead of what lowerStore() normally does (split the store
3889 // into two), following what GCC does. Cast the bits from the i64 into
3890 // an xmm register first.
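      // A sketch of the expected sequence:
      //   <bitcast value.lo / value.hi into xmm0 via lowerCast()>
      //   movq [addr], xmm0   ; one 8-byte store, atomic when naturally aligned
      //   mfence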
3891 Variable *T = makeReg(IceType_f64);
3892 auto *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value);
3893 lowerCast(Cast);
3894 // Then store XMM w/ a movq.
3895 X86OperandMem *Addr = formMemoryOperand(Ptr, IceType_f64);
3896 _storeq(T, Addr);
3897 _mfence();
3898 return;
3899 }
3900 auto *Store = InstStore::create(Func, Value, Ptr);
3901 lowerStore(Store);
3902 _mfence();
3903 return;
3904 }
3905 case Intrinsics::Bswap: {
3906 Variable *Dest = Instr->getDest();
3907 Operand *Val = Instr->getArg(0);
3908 // In 32-bit mode, bswap only works on 32-bit arguments, and the argument
3909 // must be a register. Use rotate left for 16-bit bswap.
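    // For i64 (a sketch), bswap each half and then swap the halves:
    //   bswap t_lo ; bswap t_hi ; mov dest.lo, t_hi ; mov dest.hi, t_lo
    // For i16, "rol t, 8" swaps the two bytes.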
3910 if (Val->getType() == IceType_i64) {
3911 Val = legalizeUndef(Val);
3912 Variable *T_Lo = legalizeToReg(loOperand(Val));
3913 Variable *T_Hi = legalizeToReg(hiOperand(Val));
3914 auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
3915 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
3916 _bswap(T_Lo);
3917 _bswap(T_Hi);
3918 _mov(DestLo, T_Hi);
3919 _mov(DestHi, T_Lo);
3920 } else if (Val->getType() == IceType_i32) {
3921 Variable *T = legalizeToReg(Val);
3922 _bswap(T);
3923 _mov(Dest, T);
3924 } else {
3925 assert(Val->getType() == IceType_i16);
3926 Constant *Eight = Ctx->getConstantInt16(8);
3927 Variable *T = nullptr;
3928 Val = legalize(Val);
3929 _mov(T, Val);
3930 _rol(T, Eight);
3931 _mov(Dest, T);
3932 }
3933 return;
3934 }
3935 case Intrinsics::Ctpop: {
3936 Variable *Dest = Instr->getDest();
3937 Operand *Val = Instr->getArg(0);
3938 Type ValTy = Val->getType();
3939 assert(ValTy == IceType_i32 || ValTy == IceType_i64);
3940
3941 InstCall *Call =
3942 makeHelperCall(ValTy == IceType_i32 ? RuntimeHelper::H_call_ctpop_i32
3943 : RuntimeHelper::H_call_ctpop_i64,
3944 Dest, 1);
3945 Call->addArg(Val);
3946 lowerCall(Call);
3947 // The popcount helpers always return 32-bit values, while the intrinsic's
3948 // signature matches the native POPCNT instruction and fills a 64-bit reg
3949 // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
3950 // the user doesn't do that in the IR. If the user does that in the IR,
3951 // then this zero'ing instruction is dead and gets optimized out.
3952 if (Val->getType() == IceType_i64) {
3953 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
3954 Constant *Zero = Ctx->getConstantZero(IceType_i32);
3955 _mov(DestHi, Zero);
3956 }
3957 return;
3958 }
3959 case Intrinsics::Ctlz: {
3960 // The "is zero undef" parameter is ignored and we always return a
3961 // well-defined value.
3962 Operand *Val = legalize(Instr->getArg(0));
3963 Operand *FirstVal;
3964 Operand *SecondVal = nullptr;
3965 if (Val->getType() == IceType_i64) {
3966 FirstVal = loOperand(Val);
3967 SecondVal = hiOperand(Val);
3968 } else {
3969 FirstVal = Val;
3970 }
3971 constexpr bool IsCttz = false;
3972 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
3973 SecondVal);
3974 return;
3975 }
3976 case Intrinsics::Cttz: {
3977 // The "is zero undef" parameter is ignored and we always return a
3978 // well-defined value.
3979 Operand *Val = legalize(Instr->getArg(0));
3980 Operand *FirstVal;
3981 Operand *SecondVal = nullptr;
3982 if (Val->getType() == IceType_i64) {
3983 FirstVal = hiOperand(Val);
3984 SecondVal = loOperand(Val);
3985 } else {
3986 FirstVal = Val;
3987 }
3988 constexpr bool IsCttz = true;
3989 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
3990 SecondVal);
3991 return;
3992 }
3993 case Intrinsics::Fabs: {
3994 Operand *Src = legalize(Instr->getArg(0));
3995 Type Ty = Src->getType();
3996 Variable *Dest = Instr->getDest();
3997 Variable *T = makeVectorOfFabsMask(Ty);
3998 // The pand instruction operates on an m128 memory operand, so if Src is an
3999 // f32 or f64, we need to make sure it's in a register.
4000 if (isVectorType(Ty)) {
4001 if (llvm::isa<X86OperandMem>(Src))
4002 Src = legalizeToReg(Src);
4003 } else {
4004 Src = legalizeToReg(Src);
4005 }
4006 _pand(T, Src);
4007 if (isVectorType(Ty))
4008 _movp(Dest, T);
4009 else
4010 _mov(Dest, T);
4011 return;
4012 }
4013 case Intrinsics::Longjmp: {
4014 InstCall *Call = makeHelperCall(RuntimeHelper::H_call_longjmp, nullptr, 2);
4015 Call->addArg(Instr->getArg(0));
4016 Call->addArg(Instr->getArg(1));
4017 lowerCall(Call);
4018 return;
4019 }
4020 case Intrinsics::Memcpy: {
4021 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
4022 return;
4023 }
4024 case Intrinsics::Memmove: {
4025 lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
4026 return;
4027 }
4028 case Intrinsics::Memset: {
4029 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
4030 return;
4031 }
4032 case Intrinsics::Setjmp: {
4033 InstCall *Call =
4034 makeHelperCall(RuntimeHelper::H_call_setjmp, Instr->getDest(), 1);
4035 Call->addArg(Instr->getArg(0));
4036 lowerCall(Call);
4037 return;
4038 }
4039 case Intrinsics::Sqrt: {
4040 Operand *Src = legalize(Instr->getArg(0));
4041 Variable *Dest = Instr->getDest();
4042 Variable *T = makeReg(Dest->getType());
4043 _sqrt(T, Src);
4044 if (isVectorType(Dest->getType())) {
4045 _movp(Dest, T);
4046 } else {
4047 _mov(Dest, T);
4048 }
4049 return;
4050 }
4051 case Intrinsics::Stacksave: {
4052 Variable *esp =
4053 Func->getTarget()->getPhysicalRegister(getStackReg(), WordType);
4054 Variable *Dest = Instr->getDest();
4055 _mov(Dest, esp);
4056 return;
4057 }
4058 case Intrinsics::Stackrestore: {
4059 Operand *Src = Instr->getArg(0);
4060 _mov_sp(Src);
4061 return;
4062 }
4063
4064 case Intrinsics::Trap:
4065 _ud2();
4066 return;
4067 case Intrinsics::LoadSubVector: {
4068 assert(llvm::isa<ConstantInteger32>(Instr->getArg(1)) &&
4069 "LoadSubVector second argument must be a constant");
4070 Variable *Dest = Instr->getDest();
4071 Type Ty = Dest->getType();
4072 auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(1));
4073 Operand *Addr = Instr->getArg(0);
4074 X86OperandMem *Src = formMemoryOperand(Addr, Ty);
4075 doMockBoundsCheck(Src);
4076
4077 if (Dest->isRematerializable()) {
4078 Context.insert<InstFakeDef>(Dest);
4079 return;
4080 }
4081
4082 auto *T = makeReg(Ty);
4083 switch (SubVectorSize->getValue()) {
4084 case 4:
4085 _movd(T, Src);
4086 break;
4087 case 8:
4088 _movq(T, Src);
4089 break;
4090 default:
4091 Func->setError("Unexpected size for LoadSubVector");
4092 return;
4093 }
4094 _movp(Dest, T);
4095 return;
4096 }
4097 case Intrinsics::StoreSubVector: {
4098 assert(llvm::isa<ConstantInteger32>(Instr->getArg(2)) &&
4099 "StoreSubVector third argument must be a constant");
4100 auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(2));
4101 Operand *Value = Instr->getArg(0);
4102 Operand *Addr = Instr->getArg(1);
4103 X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
4104 doMockBoundsCheck(NewAddr);
4105
4106 Value = legalizeToReg(Value);
4107
4108 switch (SubVectorSize->getValue()) {
4109 case 4:
4110 _stored(Value, NewAddr);
4111 break;
4112 case 8:
4113 _storeq(Value, NewAddr);
4114 break;
4115 default:
4116 Func->setError("Unexpected size for StoreSubVector");
4117 return;
4118 }
4119 return;
4120 }
4121 case Intrinsics::VectorPackSigned: {
4122 Operand *Src0 = Instr->getArg(0);
4123 Operand *Src1 = Instr->getArg(1);
4124 Variable *Dest = Instr->getDest();
4125 auto *T = makeReg(Src0->getType());
4126 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4127 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4128 _movp(T, Src0RM);
4129 _packss(T, Src1RM);
4130 _movp(Dest, T);
4131 return;
4132 }
4133 case Intrinsics::VectorPackUnsigned: {
4134 Operand *Src0 = Instr->getArg(0);
4135 Operand *Src1 = Instr->getArg(1);
4136 Variable *Dest = Instr->getDest();
4137 auto *T = makeReg(Src0->getType());
4138 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4139 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4140 _movp(T, Src0RM);
4141 _packus(T, Src1RM);
4142 _movp(Dest, T);
4143 return;
4144 }
4145 case Intrinsics::SignMask: {
4146 Operand *SrcReg = legalizeToReg(Instr->getArg(0));
4147 Variable *Dest = Instr->getDest();
4148 Variable *T = makeReg(IceType_i32);
4149 if (SrcReg->getType() == IceType_v4f32 ||
4150 SrcReg->getType() == IceType_v4i32 ||
4151 SrcReg->getType() == IceType_v16i8) {
4152 _movmsk(T, SrcReg);
4153 } else {
4154 // TODO(capn): We could implement v8i16 sign mask using packsswb/pmovmskb
4155 llvm::report_fatal_error("Invalid type for SignMask intrinsic");
4156 }
4157 _mov(Dest, T);
4158 return;
4159 }
4160 case Intrinsics::MultiplyHighSigned: {
4161 Operand *Src0 = Instr->getArg(0);
4162 Operand *Src1 = Instr->getArg(1);
4163 Variable *Dest = Instr->getDest();
4164 auto *T = makeReg(Dest->getType());
4165 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4166 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4167 _movp(T, Src0RM);
4168 _pmulhw(T, Src1RM);
4169 _movp(Dest, T);
4170 return;
4171 }
4172 case Intrinsics::MultiplyHighUnsigned: {
4173 Operand *Src0 = Instr->getArg(0);
4174 Operand *Src1 = Instr->getArg(1);
4175 Variable *Dest = Instr->getDest();
4176 auto *T = makeReg(Dest->getType());
4177 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4178 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4179 _movp(T, Src0RM);
4180 _pmulhuw(T, Src1RM);
4181 _movp(Dest, T);
4182 return;
4183 }
4184 case Intrinsics::MultiplyAddPairs: {
4185 Operand *Src0 = Instr->getArg(0);
4186 Operand *Src1 = Instr->getArg(1);
4187 Variable *Dest = Instr->getDest();
4188 auto *T = makeReg(Dest->getType());
4189 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4190 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4191 _movp(T, Src0RM);
4192 _pmaddwd(T, Src1RM);
4193 _movp(Dest, T);
4194 return;
4195 }
4196 case Intrinsics::AddSaturateSigned: {
4197 Operand *Src0 = Instr->getArg(0);
4198 Operand *Src1 = Instr->getArg(1);
4199 Variable *Dest = Instr->getDest();
4200 auto *T = makeReg(Dest->getType());
4201 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4202 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4203 _movp(T, Src0RM);
4204 _padds(T, Src1RM);
4205 _movp(Dest, T);
4206 return;
4207 }
4208 case Intrinsics::SubtractSaturateSigned: {
4209 Operand *Src0 = Instr->getArg(0);
4210 Operand *Src1 = Instr->getArg(1);
4211 Variable *Dest = Instr->getDest();
4212 auto *T = makeReg(Dest->getType());
4213 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4214 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4215 _movp(T, Src0RM);
4216 _psubs(T, Src1RM);
4217 _movp(Dest, T);
4218 return;
4219 }
4220 case Intrinsics::AddSaturateUnsigned: {
4221 Operand *Src0 = Instr->getArg(0);
4222 Operand *Src1 = Instr->getArg(1);
4223 Variable *Dest = Instr->getDest();
4224 auto *T = makeReg(Dest->getType());
4225 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4226 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4227 _movp(T, Src0RM);
4228 _paddus(T, Src1RM);
4229 _movp(Dest, T);
4230 return;
4231 }
4232 case Intrinsics::SubtractSaturateUnsigned: {
4233 Operand *Src0 = Instr->getArg(0);
4234 Operand *Src1 = Instr->getArg(1);
4235 Variable *Dest = Instr->getDest();
4236 auto *T = makeReg(Dest->getType());
4237 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4238 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4239 _movp(T, Src0RM);
4240 _psubus(T, Src1RM);
4241 _movp(Dest, T);
4242 return;
4243 }
4244 case Intrinsics::Nearbyint: {
4245 Operand *Src = Instr->getArg(0);
4246 Variable *Dest = Instr->getDest();
4247 Type DestTy = Dest->getType();
4248 if (isVectorType(DestTy)) {
4249 assert(DestTy == IceType_v4i32);
4250 assert(Src->getType() == IceType_v4f32);
4251 Operand *Src0R = legalizeToReg(Src);
4252 Variable *T = makeReg(DestTy);
4253 _cvt(T, Src0R, Insts::Cvt::Ps2dq);
4254 _movp(Dest, T);
4255 } else if (DestTy == IceType_i64) {
4256 llvm::report_fatal_error("Helper call was expected");
4257 } else {
4258 Operand *Src0RM = legalize(Src, Legal_Reg | Legal_Mem);
4259 // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
4260 assert(DestTy != IceType_i64);
4261 Variable *T_1 = makeReg(IceType_i32);
4262 // cvt() requires its integer argument to be a GPR.
4263 Variable *T_2 = makeReg(DestTy);
4264 if (isByteSizedType(DestTy)) {
4265 assert(T_1->getType() == IceType_i32);
4266 T_1->setRegClass(RCX86_Is32To8);
4267 T_2->setRegClass(RCX86_IsTrunc8Rcvr);
4268 }
4269 _cvt(T_1, Src0RM, Insts::Cvt::Ss2si);
4270 _mov(T_2, T_1); // T_1 and T_2 may have different integer types
4271 if (DestTy == IceType_i1)
4272 _and(T_2, Ctx->getConstantInt1(1));
4273 _mov(Dest, T_2);
4274 }
4275 return;
4276 }
4277 case Intrinsics::Round: {
4278 assert(InstructionSet >= SSE4_1);
4279 Variable *Dest = Instr->getDest();
4280 Operand *Src = Instr->getArg(0);
4281 Operand *Mode = Instr->getArg(1);
4282 assert(llvm::isa<ConstantInteger32>(Mode) &&
4283 "Round last argument must be a constant");
4284 auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
4285 int32_t Imm = llvm::cast<ConstantInteger32>(Mode)->getValue();
4286 (void)Imm;
4287 assert(Imm >= 0 && Imm < 4 && "Invalid rounding mode");
4288 auto *T = makeReg(Dest->getType());
4289 _round(T, SrcRM, Mode);
4290 _movp(Dest, T);
4291 return;
4292 }
4293 default: // UnknownIntrinsic
4294 Func->setError("Unexpected intrinsic");
4295 return;
4296 }
4297 return;
4298 }
4299
4300 void TargetX8632::lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr,
4301 Operand *Expected, Operand *Desired) {
4302 Type Ty = Expected->getType();
4303 if (Ty == IceType_i64) {
4304 // Reserve the pre-colored registers first, before adding any more
4305 // infinite-weight variables from formMemoryOperand's legalization.
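    // A sketch of the sequence being built here (the register assignments are
    // enforced by the pre-colored temporaries below):
    //   mov eax, expected.lo
    //   mov edx, expected.hi
    //   mov ebx, desired.lo
    //   mov ecx, desired.hi
    //   lock cmpxchg8b [ptr]
    //   mov destprev.lo, eax
    //   mov destprev.hi, edx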
4306 Variable *T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
4307 Variable *T_eax = makeReg(IceType_i32, RegX8632::Reg_eax);
4308 Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
4309 Variable *T_ebx = makeReg(IceType_i32, RegX8632::Reg_ebx);
4310 _mov(T_eax, loOperand(Expected));
4311 _mov(T_edx, hiOperand(Expected));
4312 _mov(T_ebx, loOperand(Desired));
4313 _mov(T_ecx, hiOperand(Desired));
4314 X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4315 constexpr bool Locked = true;
4316 _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
4317 auto *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
4318 auto *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
4319 _mov(DestLo, T_eax);
4320 _mov(DestHi, T_edx);
4321 return;
4322 }
4323 RegNumT Eax;
4324 switch (Ty) {
4325 default:
4326 llvm::report_fatal_error("Bad type for cmpxchg");
4327 case IceType_i32:
4328 Eax = RegX8632::Reg_eax;
4329 break;
4330 case IceType_i16:
4331 Eax = RegX8632::Reg_ax;
4332 break;
4333 case IceType_i8:
4334 Eax = RegX8632::Reg_al;
4335 break;
4336 }
4337 Variable *T_eax = makeReg(Ty, Eax);
4338 _mov(T_eax, Expected);
4339 X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4340 Variable *DesiredReg = legalizeToReg(Desired);
4341 constexpr bool Locked = true;
4342 _cmpxchg(Addr, T_eax, DesiredReg, Locked);
4343 _mov(DestPrev, T_eax);
4344 }
4345
4346 bool TargetX8632::tryOptimizedCmpxchgCmpBr(Variable *Dest, Operand *PtrToMem,
4347 Operand *Expected,
4348 Operand *Desired) {
4349 if (Func->getOptLevel() == Opt_m1)
4350 return false;
4351 // Peek ahead a few instructions and see how Dest is used.
4352 // It's very common to have:
4353 //
4354 // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
4355 // [%y_phi = ...] // list of phi stores
4356 // %p = icmp eq i32 %x, %expected
4357 // br i1 %p, label %l1, label %l2
4358 //
4359 // which we can optimize into:
4360 //
4361 // %x = <cmpxchg code>
4362 // [%y_phi = ...] // list of phi stores
4363 // br eq, %l1, %l2
4364 InstList::iterator I = Context.getCur();
4365 // I is currently the InstIntrinsic. Peek past that.
4366 // This assumes that the atomic cmpxchg has not been lowered yet,
4367 // so that the instructions seen in the scan from "Cur" are simple.
4368 assert(llvm::isa<InstIntrinsic>(*I));
4369 Inst *NextInst = Context.getNextInst(I);
4370 if (!NextInst)
4371 return false;
4372 // There might be phi assignments right before the compare+branch, since
4373 // this could be a backward branch for a loop. This placement of assignments
4374 // is determined by placePhiStores().
4375 CfgVector<InstAssign *> PhiAssigns;
4376 while (auto *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
4377 if (PhiAssign->getDest() == Dest)
4378 return false;
4379 PhiAssigns.push_back(PhiAssign);
4380 NextInst = Context.getNextInst(I);
4381 if (!NextInst)
4382 return false;
4383 }
4384 if (auto *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
4385 if (!(NextCmp->getCondition() == InstIcmp::Eq &&
4386 ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
4387 (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
4388 return false;
4389 }
4390 NextInst = Context.getNextInst(I);
4391 if (!NextInst)
4392 return false;
4393 if (auto *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
4394 if (!NextBr->isUnconditional() &&
4395 NextCmp->getDest() == NextBr->getCondition() &&
4396 NextBr->isLastUse(NextCmp->getDest())) {
4397 lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
4398 for (size_t i = 0; i < PhiAssigns.size(); ++i) {
4399 // Lower the phi assignments now, before the branch (same placement
4400 // as before).
4401 InstAssign *PhiAssign = PhiAssigns[i];
4402 PhiAssign->setDeleted();
4403 lowerAssign(PhiAssign);
4404 Context.advanceNext();
4405 }
4406 _br(CondX86::Br_e, NextBr->getTargetTrue(), NextBr->getTargetFalse());
4407 // Skip over the old compare and branch, by deleting them.
4408 NextCmp->setDeleted();
4409 NextBr->setDeleted();
4410 Context.advanceNext();
4411 Context.advanceNext();
4412 return true;
4413 }
4414 }
4415 }
4416 return false;
4417 }
4418
4419 void TargetX8632::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
4420 Operand *Ptr, Operand *Val) {
4421 bool NeedsCmpxchg = false;
4422 LowerBinOp Op_Lo = nullptr;
4423 LowerBinOp Op_Hi = nullptr;
4424 switch (Operation) {
4425 default:
4426 Func->setError("Unknown AtomicRMW operation");
4427 return;
4428 case Intrinsics::AtomicAdd: {
4429 if (Dest->getType() == IceType_i64) {
4430 // All the fall-through paths must set this to true, but use this
4431 // for asserting.
4432 NeedsCmpxchg = true;
4433 Op_Lo = &TargetX8632::_add;
4434 Op_Hi = &TargetX8632::_adc;
4435 break;
4436 }
4437 X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
4438 constexpr bool Locked = true;
4439 Variable *T = nullptr;
4440 _mov(T, Val);
4441 _xadd(Addr, T, Locked);
4442 _mov(Dest, T);
4443 return;
4444 }
4445 case Intrinsics::AtomicSub: {
4446 if (Dest->getType() == IceType_i64) {
4447 NeedsCmpxchg = true;
4448 Op_Lo = &TargetX8632::_sub;
4449 Op_Hi = &TargetX8632::_sbb;
4450 break;
4451 }
4452 X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
4453 constexpr bool Locked = true;
4454 Variable *T = nullptr;
4455 _mov(T, Val);
4456 _neg(T);
4457 _xadd(Addr, T, Locked);
4458 _mov(Dest, T);
4459 return;
4460 }
4461 case Intrinsics::AtomicOr:
4462 // TODO(jvoung): If Dest is null or dead, then some of these
4463 // operations do not need an "exchange", but just a locked op.
4464 // That appears to be "worth" it for sub, or, and, and xor.
4465 // xadd is probably fine vs lock add for add, and xchg is fine
4466 // vs an atomic store.
4467 NeedsCmpxchg = true;
4468 Op_Lo = &TargetX8632::_or;
4469 Op_Hi = &TargetX8632::_or;
4470 break;
4471 case Intrinsics::AtomicAnd:
4472 NeedsCmpxchg = true;
4473 Op_Lo = &TargetX8632::_and;
4474 Op_Hi = &TargetX8632::_and;
4475 break;
4476 case Intrinsics::AtomicXor:
4477 NeedsCmpxchg = true;
4478 Op_Lo = &TargetX8632::_xor;
4479 Op_Hi = &TargetX8632::_xor;
4480 break;
4481 case Intrinsics::AtomicExchange:
4482 if (Dest->getType() == IceType_i64) {
4483 NeedsCmpxchg = true;
4484 // NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values
4485 // just need to be moved to the ecx and ebx registers.
4486 Op_Lo = nullptr;
4487 Op_Hi = nullptr;
4488 break;
4489 }
4490 X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
4491 Variable *T = nullptr;
4492 _mov(T, Val);
4493 _xchg(Addr, T);
4494 _mov(Dest, T);
4495 return;
4496 }
4497 // Otherwise, we need a cmpxchg loop.
4498 (void)NeedsCmpxchg;
4499 assert(NeedsCmpxchg);
4500 expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
4501 }
4502
4503 void TargetX8632::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo, LowerBinOp Op_Hi,
4504 Variable *Dest, Operand *Ptr,
4505 Operand *Val) {
4506 // Expand a more complex RMW operation as a cmpxchg loop:
4507 // For 64-bit:
4508 // mov eax, [ptr]
4509 // mov edx, [ptr + 4]
4510 // .LABEL:
4511 // mov ebx, eax
4512 // <Op_Lo> ebx, <desired_adj_lo>
4513 // mov ecx, edx
4514 // <Op_Hi> ecx, <desired_adj_hi>
4515 // lock cmpxchg8b [ptr]
4516 // jne .LABEL
4517 // mov <dest_lo>, eax
4518 // mov <dest_hi>, edx
4519 //
4520 // For 32-bit:
4521 // mov eax, [ptr]
4522 // .LABEL:
4523 // mov <reg>, eax
4524 // op <reg>, [desired_adj]
4525 // lock cmpxchg [ptr], <reg>
4526 // jne .LABEL
4527 // mov <dest>, eax
4528 //
4529 // If Op_{Lo,Hi} are nullptr, then just copy the value.
4530 Val = legalize(Val);
4531 Type Ty = Val->getType();
4532 if (Ty == IceType_i64) {
4533 Variable *T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
4534 Variable *T_eax = makeReg(IceType_i32, RegX8632::Reg_eax);
4535 X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4536 _mov(T_eax, loOperand(Addr));
4537 _mov(T_edx, hiOperand(Addr));
4538 Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
4539 Variable *T_ebx = makeReg(IceType_i32, RegX8632::Reg_ebx);
4540 InstX86Label *Label = InstX86Label::create(Func, this);
4541 const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr;
4542 if (!IsXchg8b) {
4543 Context.insert(Label);
4544 _mov(T_ebx, T_eax);
4545 (this->*Op_Lo)(T_ebx, loOperand(Val));
4546 _mov(T_ecx, T_edx);
4547 (this->*Op_Hi)(T_ecx, hiOperand(Val));
4548 } else {
4549 // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
4550 // It just needs the Val loaded into ebx and ecx.
4551 // That can also be done before the loop.
4552 _mov(T_ebx, loOperand(Val));
4553 _mov(T_ecx, hiOperand(Val));
4554 Context.insert(Label);
4555 }
4556 constexpr bool Locked = true;
4557 _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
4558 _br(CondX86::Br_ne, Label);
4559 if (!IsXchg8b) {
4560 // If Val is a variable, model the extended live range of Val through
4561 // the end of the loop, since it will be re-used by the loop.
4562 if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
4563 auto *ValLo = llvm::cast<Variable>(loOperand(ValVar));
4564 auto *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
4565 Context.insert<InstFakeUse>(ValLo);
4566 Context.insert<InstFakeUse>(ValHi);
4567 }
4568 } else {
4569 // For xchg, the loop is slightly smaller and ebx/ecx are used.
4570 Context.insert<InstFakeUse>(T_ebx);
4571 Context.insert<InstFakeUse>(T_ecx);
4572 }
4573 // The address base (if any) is also reused in the loop.
4574 if (Variable *Base = Addr->getBase())
4575 Context.insert<InstFakeUse>(Base);
4576 auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
4577 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
4578 _mov(DestLo, T_eax);
4579 _mov(DestHi, T_edx);
4580 return;
4581 }
4582 X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4583 RegNumT Eax;
4584 switch (Ty) {
4585 default:
4586 llvm::report_fatal_error("Bad type for atomicRMW");
4587 case IceType_i32:
4588 Eax = RegX8632::Reg_eax;
4589 break;
4590 case IceType_i16:
4591 Eax = RegX8632::Reg_ax;
4592 break;
4593 case IceType_i8:
4594 Eax = RegX8632::Reg_al;
4595 break;
4596 }
4597 Variable *T_eax = makeReg(Ty, Eax);
4598 _mov(T_eax, Addr);
4599 auto *Label = Context.insert<InstX86Label>(this);
4600 // We want to pick a different register for T than Eax, so don't use
4601 // _mov(T == nullptr, T_eax).
4602 Variable *T = makeReg(Ty);
4603 _mov(T, T_eax);
4604 (this->*Op_Lo)(T, Val);
4605 constexpr bool Locked = true;
4606 _cmpxchg(Addr, T_eax, T, Locked);
4607 _br(CondX86::Br_ne, Label);
4608 // If Val is a variable, model the extended live range of Val through
4609 // the end of the loop, since it will be re-used by the loop.
4610 if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
4611 Context.insert<InstFakeUse>(ValVar);
4612 }
4613 // The address base (if any) is also reused in the loop.
4614 if (Variable *Base = Addr->getBase())
4615 Context.insert<InstFakeUse>(Base);
4616 _mov(Dest, T_eax);
4617 }
4618
4619 /// Lowers count {trailing, leading} zeros intrinsic.
4620 ///
4621 /// We could do constant folding here, but that should have
4622 /// been done by the front-end/middle-end optimizations.
4623
4624 void TargetX8632::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
4625 Operand *FirstVal, Operand *SecondVal) {
4626 // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
4627 // Then the instructions will handle the Val == 0 case much more simply
4628 // and won't require conversion from bit position to number of zeros.
4629 //
4630 // Otherwise:
4631 // bsr IF_NOT_ZERO, Val
4632 // mov T_DEST, ((Ty == i32) ? 63 : 127)
4633 // cmovne T_DEST, IF_NOT_ZERO
4634 // xor T_DEST, ((Ty == i32) ? 31 : 63)
4635 // mov DEST, T_DEST
4636 //
4637 // NOTE: T_DEST must be a register because cmov requires its dest to be a
4638 // register. Also, bsf and bsr require their dest to be a register.
4639 //
4640 // The xor DEST, C(31|63) converts a bit position to # of leading zeroes.
4641 // E.g., for 000... 00001100, bsr will say that the most significant bit
4642 // set is at position 3, while the number of leading zeros is 28. Xor is
4643 // like (M - N) for N <= M, and converts 63 to 32, and 127 to 64 (for the
4644 // all-zeros case).
4645 //
4646 // X8632 only: Similar for 64-bit, but start w/ speculating that the upper
4647 // 32 bits are all zero, and compute the result for that case (checking the
4648 // lower 32 bits). Then actually compute the result for the upper bits and
4649 // cmov in the result from the lower computation if the earlier speculation
4650 // was correct.
4651 //
4652 // Cttz is similar, but uses bsf instead, and doesn't require the xor
4653 // bit position conversion, and the speculation is reversed.
4654
4655 // TODO(jpp): refactor this method.
4656 assert(Ty == IceType_i32 || Ty == IceType_i64);
4657 const Type DestTy = IceType_i32;
4658 Variable *T = makeReg(DestTy);
4659 Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
4660 if (Cttz) {
4661 _bsf(T, FirstValRM);
4662 } else {
4663 _bsr(T, FirstValRM);
4664 }
4665 Variable *T_Dest = makeReg(DestTy);
4666 Constant *_31 = Ctx->getConstantInt32(31);
4667 Constant *_32 = Ctx->getConstantInt(DestTy, 32);
4668 Constant *_63 = Ctx->getConstantInt(DestTy, 63);
4669 Constant *_64 = Ctx->getConstantInt(DestTy, 64);
4670 if (Cttz) {
4671 if (DestTy == IceType_i64) {
4672 _mov(T_Dest, _64);
4673 } else {
4674 _mov(T_Dest, _32);
4675 }
4676 } else {
4677 Constant *_127 = Ctx->getConstantInt(DestTy, 127);
4678 if (DestTy == IceType_i64) {
4679 _mov(T_Dest, _127);
4680 } else {
4681 _mov(T_Dest, _63);
4682 }
4683 }
4684 _cmov(T_Dest, T, CondX86::Br_ne);
4685 if (!Cttz) {
4686 if (DestTy == IceType_i64) {
4687 // Even though there's a _63 available at this point, that constant
4688 // might not be an i32, which will cause the xor emission to fail.
4689 Constant *_63 = Ctx->getConstantInt32(63);
4690 _xor(T_Dest, _63);
4691 } else {
4692 _xor(T_Dest, _31);
4693 }
4694 }
4695 if (Ty == IceType_i32) {
4696 _mov(Dest, T_Dest);
4697 return;
4698 }
4699 _add(T_Dest, _32);
4700 auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
4701 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
4702 // Will be using "test" on this, so we need a registerized variable.
4703 Variable *SecondVar = legalizeToReg(SecondVal);
4704 Variable *T_Dest2 = makeReg(IceType_i32);
4705 if (Cttz) {
4706 _bsf(T_Dest2, SecondVar);
4707 } else {
4708 _bsr(T_Dest2, SecondVar);
4709 _xor(T_Dest2, _31);
4710 }
4711 _test(SecondVar, SecondVar);
4712 _cmov(T_Dest2, T_Dest, CondX86::Br_e);
4713 _mov(DestLo, T_Dest2);
4714 _mov(DestHi, Ctx->getConstantZero(IceType_i32));
4715 }
4716
4717 void TargetX8632::typedLoad(Type Ty, Variable *Dest, Variable *Base,
4718 Constant *Offset) {
4719 // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
4720 // legalize Mem properly.
4721 if (Offset)
4722 assert(!llvm::isa<ConstantRelocatable>(Offset));
4723
4724 auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
4725
4726 if (isVectorType(Ty))
4727 _movp(Dest, Mem);
4728 else if (Ty == IceType_f64)
4729 _movq(Dest, Mem);
4730 else
4731 _mov(Dest, Mem);
4732 }
4733
4734 void TargetX8632::typedStore(Type Ty, Variable *Value, Variable *Base,
4735 Constant *Offset) {
4736 // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
4737 // legalize Mem properly.
4738 if (Offset)
4739 assert(!llvm::isa<ConstantRelocatable>(Offset));
4740
4741 auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
4742
4743 if (isVectorType(Ty))
4744 _storep(Value, Mem);
4745 else if (Ty == IceType_f64)
4746 _storeq(Value, Mem);
4747 else
4748 _store(Value, Mem);
4749 }
4750
4751 void TargetX8632::copyMemory(Type Ty, Variable *Dest, Variable *Src,
4752 int32_t OffsetAmt) {
4753 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
4754 // TODO(ascull): this or add nullptr test to _movp, _movq
4755 Variable *Data = makeReg(Ty);
4756
4757 typedLoad(Ty, Data, Src, Offset);
4758 typedStore(Ty, Data, Dest, Offset);
4759 }
4760
4761 void TargetX8632::lowerMemcpy(Operand *Dest, Operand *Src, Operand *Count) {
4762 // There is a load and store for each chunk in the unroll
4763 constexpr uint32_t BytesPerStorep = 16;
4764
4765 // Check if the operands are constants
4766 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
4767 const bool IsCountConst = CountConst != nullptr;
4768 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
4769
4770 if (shouldOptimizeMemIntrins() && IsCountConst &&
4771 CountValue <= BytesPerStorep * MEMCPY_UNROLL_LIMIT) {
4772 // Unlikely, but nothing to do if it does happen
4773 if (CountValue == 0)
4774 return;
4775
4776 Variable *SrcBase = legalizeToReg(Src);
4777 Variable *DestBase = legalizeToReg(Dest);
4778
4779 // Find the largest type that can be used and use it as much as possible
4780 // in reverse order. Then handle any remainder with overlapping copies.
4781 // Since the remainder will be at the end, there will be reduced pressure
4782 // on the memory unit as the accesses to the same memory are far apart.
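// As an illustrative example (assuming largestTypeInSize picks an 8-byte
// chunk here and firstTypeThatFitsSize rounds the remainder up to 8 bytes),
// a 13-byte memcpy becomes an 8-byte copy at offset 0 followed by an
// overlapping 8-byte copy at offset 5, covering bytes 5..12.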
4783 Type Ty = largestTypeInSize(CountValue);
4784 uint32_t TyWidth = typeWidthInBytes(Ty);
4785
4786 uint32_t RemainingBytes = CountValue;
4787 int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
4788 while (RemainingBytes >= TyWidth) {
4789 copyMemory(Ty, DestBase, SrcBase, Offset);
4790 RemainingBytes -= TyWidth;
4791 Offset -= TyWidth;
4792 }
4793
4794 if (RemainingBytes == 0)
4795 return;
4796
4797 // Lower the remaining bytes. Adjust to larger types in order to make use
4798 // of overlaps in the copies.
4799 Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
4800 Offset = CountValue - typeWidthInBytes(LeftOverTy);
4801 copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
4802 return;
4803 }
4804
4805 // Fall back on a function call
4806 InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memcpy, nullptr, 3);
4807 Call->addArg(Dest);
4808 Call->addArg(Src);
4809 Call->addArg(Count);
4810 lowerCall(Call);
4811 }
4812
4813 void TargetX8632::lowerMemmove(Operand *Dest, Operand *Src, Operand *Count) {
4814 // There is a load and store for each chunk in the unroll
4815 constexpr uint32_t BytesPerStorep = 16;
4816
4817 // Check if the operands are constants
4818 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
4819 const bool IsCountConst = CountConst != nullptr;
4820 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
4821
4822 if (shouldOptimizeMemIntrins() && IsCountConst &&
4823 CountValue <= BytesPerStorep * MEMMOVE_UNROLL_LIMIT) {
4824 // Unlikely, but nothing to do if it does happen
4825 if (CountValue == 0)
4826 return;
4827
4828 Variable *SrcBase = legalizeToReg(Src);
4829 Variable *DestBase = legalizeToReg(Dest);
4830
4831 std::tuple<Type, Constant *, Variable *> Moves[MEMMOVE_UNROLL_LIMIT];
4832 Constant *Offset;
4833 Variable *Reg;
4834
4835 // Copy the data into registers first: the source and destination could
4836 // overlap, so we must not clobber memory that has yet to be read. This
4837 // also means overlapping moves can be used, since we are working from a
4838 // safe snapshot of the source.
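// For illustration (assuming a 16-byte largest chunk), a 24-byte memmove
// would load a 16-byte chunk at offset 0 and an 8-byte chunk at offset 16
// into registers first, and only then store both to the destination, so
// overlapping source/destination ranges cannot corrupt data not yet read.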
4839 Type Ty = largestTypeInSize(CountValue);
4840 uint32_t TyWidth = typeWidthInBytes(Ty);
4841
4842 uint32_t RemainingBytes = CountValue;
4843 int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
4844 size_t N = 0;
4845 while (RemainingBytes >= TyWidth) {
4846 assert(N <= MEMMOVE_UNROLL_LIMIT);
4847 Offset = Ctx->getConstantInt32(OffsetAmt);
4848 Reg = makeReg(Ty);
4849 typedLoad(Ty, Reg, SrcBase, Offset);
4850 RemainingBytes -= TyWidth;
4851 OffsetAmt -= TyWidth;
4852 Moves[N++] = std::make_tuple(Ty, Offset, Reg);
4853 }
4854
4855 if (RemainingBytes != 0) {
4856 // Lower the remaining bytes. Adjust to larger types in order to make
4857 // use of overlaps in the copies.
4858 assert(N <= MEMMOVE_UNROLL_LIMIT);
4859 Ty = firstTypeThatFitsSize(RemainingBytes);
4860 Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
4861 Reg = makeReg(Ty);
4862 typedLoad(Ty, Reg, SrcBase, Offset);
4863 Moves[N++] = std::make_tuple(Ty, Offset, Reg);
4864 }
4865
4866 // Copy the data out into the destination memory
4867 for (size_t i = 0; i < N; ++i) {
4868 std::tie(Ty, Offset, Reg) = Moves[i];
4869 typedStore(Ty, Reg, DestBase, Offset);
4870 }
4871
4872 return;
4873 }
4874
4875 // Fall back on a function call
4876 InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memmove, nullptr, 3);
4877 Call->addArg(Dest);
4878 Call->addArg(Src);
4879 Call->addArg(Count);
4880 lowerCall(Call);
4881 }
4882
4883 void TargetX8632::lowerMemset(Operand *Dest, Operand *Val, Operand *Count) {
4884 constexpr uint32_t BytesPerStorep = 16;
4885 constexpr uint32_t BytesPerStoreq = 8;
4886 constexpr uint32_t BytesPerStorei32 = 4;
4887 assert(Val->getType() == IceType_i8);
4888
4889 // Check if the operands are constants
4890 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
4891 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
4892 const bool IsCountConst = CountConst != nullptr;
4893 const bool IsValConst = ValConst != nullptr;
4894 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
4895 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
4896
4897 // Unlikely, but nothing to do if it does happen
4898 if (IsCountConst && CountValue == 0)
4899 return;
4900
4901 // TODO(ascull): if the count is constant but val is not, it would be
4902 // possible to inline by spreading the value across 4 bytes and accessing
4903 // subregs, e.g. eax, ax, and al.
4904 if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
4905 Variable *Base = nullptr;
4906 Variable *VecReg = nullptr;
4907 const uint32_t MaskValue = (ValValue & 0xff);
4908 const uint32_t SpreadValue =
4909 (MaskValue << 24) | (MaskValue << 16) | (MaskValue << 8) | MaskValue;
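// For example, ValValue == 0x5A spreads to SpreadValue == 0x5A5A5A5A, so a
// single 32-bit store writes four copies of the byte.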
4910
4911 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
4912 uint32_t OffsetAmt) {
4913 assert(Base != nullptr);
4914 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
4915
4916 // TODO(ascull): is 64-bit better with vector or scalar movq?
4917 auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
4918 if (isVectorType(Ty)) {
4919 assert(VecReg != nullptr);
4920 _storep(VecReg, Mem);
4921 } else if (Ty == IceType_f64) {
4922 assert(VecReg != nullptr);
4923 _storeq(VecReg, Mem);
4924 } else {
4925 assert(Ty != IceType_i64);
4926 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
4927 }
4928 };
4929
4930 // Find the largest type that can be used and use it as much as possible
4931 // in reverse order. Then handle any remainder with overlapping copies.
4932 // Since the remainder will be at the end, there will be reduced pressure
4933 // on the memory unit as the accesses to the same memory are far apart.
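// As a rough illustration for the non-zero-value path below (assuming
// largestTypeInSize capped at 4 bytes yields i32 and firstTypeThatFitsSize
// maps 2 bytes to i16), memset(p, 0xAB, 10) would emit 32-bit stores of the
// spread value at offsets 4 and 0, then a 16-bit store covering bytes 8..9.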
4934 Type Ty = IceType_void;
4935 if (ValValue == 0 && CountValue >= BytesPerStoreq &&
4936 CountValue <= BytesPerStorep * MEMSET_UNROLL_LIMIT) {
4937 // When the value is zero it can be loaded into a vector register
4938 // cheaply using the xor trick.
4939 Base = legalizeToReg(Dest);
4940 VecReg = makeVectorOfZeros(IceType_v16i8);
4941 Ty = largestTypeInSize(CountValue);
4942 } else if (CountValue <= BytesPerStorei32 * MEMSET_UNROLL_LIMIT) {
4943 // When the value is non-zero or the count is small we can't use vector
4944 // instructions so are limited to 32-bit stores.
4945 Base = legalizeToReg(Dest);
4946 constexpr uint32_t MaxSize = 4;
4947 Ty = largestTypeInSize(CountValue, MaxSize);
4948 }
4949
4950 if (Base) {
4951 uint32_t TyWidth = typeWidthInBytes(Ty);
4952
4953 uint32_t RemainingBytes = CountValue;
4954 uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
4955 while (RemainingBytes >= TyWidth) {
4956 lowerSet(Ty, Offset);
4957 RemainingBytes -= TyWidth;
4958 Offset -= TyWidth;
4959 }
4960
4961 if (RemainingBytes == 0)
4962 return;
4963
4964 // Lower the remaining bytes. Adjust to larger types in order to make
4965 // use of overlaps in the copies.
4966 Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
4967 Offset = CountValue - typeWidthInBytes(LeftOverTy);
4968 lowerSet(LeftOverTy, Offset);
4969 return;
4970 }
4971 }
4972
4973 // Fall back on calling the memset function. The value operand needs to be
4974 // extended to a stack slot size because the PNaCl ABI requires arguments to
4975 // be at least 32 bits wide.
4976 Operand *ValExt;
4977 if (IsValConst) {
4978 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
4979 } else {
4980 Variable *ValExtVar = Func->makeVariable(stackSlotType());
4981 lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val));
4982 ValExt = ValExtVar;
4983 }
4984 InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memset, nullptr, 3);
4985 Call->addArg(Dest);
4986 Call->addArg(ValExt);
4987 Call->addArg(Count);
4988 lowerCall(Call);
4989 }
4990
4991 class AddressOptimizer {
4992 AddressOptimizer() = delete;
4993 AddressOptimizer(const AddressOptimizer &) = delete;
4994 AddressOptimizer &operator=(const AddressOptimizer &) = delete;
4995
4996 public:
4997 explicit AddressOptimizer(const Cfg *Func)
4998 : Func(Func), VMetadata(Func->getVMetadata()) {}
4999
5000 inline void dumpAddressOpt(const ConstantRelocatable *const Relocatable,
5001 int32_t Offset, const Variable *Base,
5002 const Variable *Index, uint16_t Shift,
5003 const Inst *Reason) const;
5004
5005 inline const Inst *matchAssign(Variable **Var,
5006 ConstantRelocatable **Relocatable,
5007 int32_t *Offset);
5008
5009 inline const Inst *matchCombinedBaseIndex(Variable **Base, Variable **Index,
5010 uint16_t *Shift);
5011
5012 inline const Inst *matchShiftedIndex(Variable **Index, uint16_t *Shift);
5013
5014 inline const Inst *matchOffsetIndexOrBase(Variable **IndexOrBase,
5015 const uint16_t Shift,
5016 ConstantRelocatable **Relocatable,
5017 int32_t *Offset);
5018
5019 private:
5020 const Cfg *const Func;
5021 const VariablesMetadata *const VMetadata;
5022
5023 static bool isAdd(const Inst *Instr) {
5024 if (auto *Arith = llvm::dyn_cast_or_null<const InstArithmetic>(Instr)) {
5025 return (Arith->getOp() == InstArithmetic::Add);
5026 }
5027 return false;
5028 }
5029 };
5030
5031 void AddressOptimizer::dumpAddressOpt(
5032 const ConstantRelocatable *const Relocatable, int32_t Offset,
5033 const Variable *Base, const Variable *Index, uint16_t Shift,
5034 const Inst *Reason) const {
5035 if (!BuildDefs::dump())
5036 return;
5037 if (!Func->isVerbose(IceV_AddrOpt))
5038 return;
5039 OstreamLocker L(Func->getContext());
5040 Ostream &Str = Func->getContext()->getStrDump();
5041 Str << "Instruction: ";
5042 Reason->dumpDecorated(Func);
5043 Str << " results in Base=";
5044 if (Base)
5045 Base->dump(Func);
5046 else
5047 Str << "<null>";
5048 Str << ", Index=";
5049 if (Index)
5050 Index->dump(Func);
5051 else
5052 Str << "<null>";
5053 Str << ", Shift=" << Shift << ", Offset=" << Offset
5054 << ", Relocatable=" << Relocatable << "\n";
5055 }
5056
5057 const Inst *AddressOptimizer::matchAssign(Variable **Var,
5058 ConstantRelocatable **Relocatable,
5059 int32_t *Offset) {
5060 // Var originates from Var=SrcVar ==> set Var:=SrcVar
5061 if (*Var == nullptr)
5062 return nullptr;
5063 if (const Inst *VarAssign = VMetadata->getSingleDefinition(*Var)) {
5064 assert(!VMetadata->isMultiDef(*Var));
5065 if (llvm::isa<InstAssign>(VarAssign)) {
5066 Operand *SrcOp = VarAssign->getSrc(0);
5067 assert(SrcOp);
5068 if (auto *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
5069 if (!VMetadata->isMultiDef(SrcVar) &&
5070 // TODO: ensure SrcVar stays single-BB
5071 true) {
5072 *Var = SrcVar;
5073 return VarAssign;
5074 }
5075 } else if (auto *Const = llvm::dyn_cast<ConstantInteger32>(SrcOp)) {
5076 int32_t MoreOffset = Const->getValue();
5077 if (Utils::WouldOverflowAdd(*Offset, MoreOffset))
5078 return nullptr;
5079 *Var = nullptr;
5080 *Offset += MoreOffset;
5081 return VarAssign;
5082 } else if (auto *AddReloc = llvm::dyn_cast<ConstantRelocatable>(SrcOp)) {
5083 if (*Relocatable == nullptr) {
5084 // It is always safe to fold a relocatable through assignment -- the
5085 // assignment frees a slot in the address operand that can be used
5086 // to hold the Sandbox Pointer -- if any.
5087 *Var = nullptr;
5088 *Relocatable = AddReloc;
5089 return VarAssign;
5090 }
5091 }
5092 }
5093 }
5094 return nullptr;
5095 }
5096
5097 const Inst *AddressOptimizer::matchCombinedBaseIndex(Variable **Base,
5098 Variable **Index,
5099 uint16_t *Shift) {
5100 // Index==nullptr && Base is Base=Var1+Var2 ==>
5101 // set Base=Var1, Index=Var2, Shift=0
5102 if (*Base == nullptr)
5103 return nullptr;
5104 if (*Index != nullptr)
5105 return nullptr;
5106 auto *BaseInst = VMetadata->getSingleDefinition(*Base);
5107 if (BaseInst == nullptr)
5108 return nullptr;
5109 assert(!VMetadata->isMultiDef(*Base));
5110 if (BaseInst->getSrcSize() < 2)
5111 return nullptr;
5112 if (auto *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
5113 if (VMetadata->isMultiDef(Var1))
5114 return nullptr;
5115 if (auto *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
5116 if (VMetadata->isMultiDef(Var2))
5117 return nullptr;
5118 if (isAdd(BaseInst) &&
5119 // TODO: ensure Var1 and Var2 stay single-BB
5120 true) {
5121 *Base = Var1;
5122 *Index = Var2;
5123 *Shift = 0; // should already have been 0
5124 return BaseInst;
5125 }
5126 }
5127 }
5128 return nullptr;
5129 }
5130
5131 const Inst *AddressOptimizer::matchShiftedIndex(Variable **Index,
5132 uint16_t *Shift) {
5133 // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
5134 // Index=Var, Shift+=log2(Const)
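// For example, Index = Var * 8 with Shift == 0 becomes Index = Var with
// Shift == 3, and Index = Var << 1 with an existing Shift of 2 also ends up
// with Shift == 3.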
5135 if (*Index == nullptr)
5136 return nullptr;
5137 auto *IndexInst = VMetadata->getSingleDefinition(*Index);
5138 if (IndexInst == nullptr)
5139 return nullptr;
5140 assert(!VMetadata->isMultiDef(*Index));
5141
5142 // When using an unsigned 32-bit array index on x64, it gets zero-extended
5143 // before the shift & add. The explicit zero extension can be eliminated
5144 // because x86 32-bit operations automatically get zero-extended into the
5145 // corresponding 64-bit register.
5146 if (auto *CastInst = llvm::dyn_cast<InstCast>(IndexInst)) {
5147 if (CastInst->getCastKind() == InstCast::Zext) {
5148 if (auto *Var = llvm::dyn_cast<Variable>(CastInst->getSrc(0))) {
5149 if (Var->getType() == IceType_i32 &&
5150 CastInst->getDest()->getType() == IceType_i64) {
5151 IndexInst = VMetadata->getSingleDefinition(Var);
5152 }
5153 }
5154 }
5155 }
5156
5157 if (IndexInst->getSrcSize() < 2)
5158 return nullptr;
5159 if (auto *ArithInst = llvm::dyn_cast<InstArithmetic>(IndexInst)) {
5160 if (auto *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
5161 if (auto *Const =
5162 llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
5163 if (VMetadata->isMultiDef(Var) || Const->getType() != IceType_i32)
5164 return nullptr;
5165 switch (ArithInst->getOp()) {
5166 default:
5167 return nullptr;
5168 case InstArithmetic::Mul: {
5169 uint32_t Mult = Const->getValue();
5170 uint32_t LogMult;
5171 switch (Mult) {
5172 case 1:
5173 LogMult = 0;
5174 break;
5175 case 2:
5176 LogMult = 1;
5177 break;
5178 case 4:
5179 LogMult = 2;
5180 break;
5181 case 8:
5182 LogMult = 3;
5183 break;
5184 default:
5185 return nullptr;
5186 }
5187 if (*Shift + LogMult <= 3) {
5188 *Index = Var;
5189 *Shift += LogMult;
5190 return IndexInst;
5191 }
5192 }
5193 case InstArithmetic::Shl: {
5194 uint32_t ShiftAmount = Const->getValue();
5195 switch (ShiftAmount) {
5196 case 0:
5197 case 1:
5198 case 2:
5199 case 3:
5200 break;
5201 default:
5202 return nullptr;
5203 }
5204 if (*Shift + ShiftAmount <= 3) {
5205 *Index = Var;
5206 *Shift += ShiftAmount;
5207 return IndexInst;
5208 }
5209 }
5210 }
5211 }
5212 }
5213 }
5214 return nullptr;
5215 }
5216
5217 const Inst *AddressOptimizer::matchOffsetIndexOrBase(
5218 Variable **IndexOrBase, const uint16_t Shift,
5219 ConstantRelocatable **Relocatable, int32_t *Offset) {
5220 // Base is Base=Var+Const || Base is Base=Const+Var ==>
5221 // set Base=Var, Offset+=Const
5222 // Base is Base=Var-Const ==>
5223 // set Base=Var, Offset-=Const
5224 // Index is Index=Var+Const ==>
5225 // set Index=Var, Offset+=(Const<<Shift)
5226 // Index is Index=Const+Var ==>
5227 // set Index=Var, Offset+=(Const<<Shift)
5228 // Index is Index=Var-Const ==>
5229 // set Index=Var, Offset-=(Const<<Shift)
5230 // Treat Index=Var Or Const as Index=Var + Const
5231 // when Var = Var' << N and log2(Const) <= N
5232 // or when Var = (2^M) * (2^N) and log2(Const) <= (M+N)
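// For example, if Var was defined as Var' << 4, the low four bits of Var are
// known to be zero, so (Var | 12) is equivalent to (Var + 12) and the 12 can
// be folded into the constant offset.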
5233
5234 if (*IndexOrBase == nullptr) {
5235 return nullptr;
5236 }
5237 const Inst *Definition = VMetadata->getSingleDefinition(*IndexOrBase);
5238 if (Definition == nullptr) {
5239 return nullptr;
5240 }
5241 assert(!VMetadata->isMultiDef(*IndexOrBase));
5242 if (auto *ArithInst = llvm::dyn_cast<const InstArithmetic>(Definition)) {
5243 switch (ArithInst->getOp()) {
5244 case InstArithmetic::Add:
5245 case InstArithmetic::Sub:
5246 case InstArithmetic::Or:
5247 break;
5248 default:
5249 return nullptr;
5250 }
5251
5252 Operand *Src0 = ArithInst->getSrc(0);
5253 Operand *Src1 = ArithInst->getSrc(1);
5254 auto *Var0 = llvm::dyn_cast<Variable>(Src0);
5255 auto *Var1 = llvm::dyn_cast<Variable>(Src1);
5256 auto *Const0 = llvm::dyn_cast<ConstantInteger32>(Src0);
5257 auto *Const1 = llvm::dyn_cast<ConstantInteger32>(Src1);
5258 auto *Reloc0 = llvm::dyn_cast<ConstantRelocatable>(Src0);
5259 auto *Reloc1 = llvm::dyn_cast<ConstantRelocatable>(Src1);
5260
5261 bool IsAdd = false;
5262 if (ArithInst->getOp() == InstArithmetic::Or) {
5263 Variable *Var = nullptr;
5264 ConstantInteger32 *Const = nullptr;
5265 if (Var0 && Const1) {
5266 Var = Var0;
5267 Const = Const1;
5268 } else if (Const0 && Var1) {
5269 Var = Var1;
5270 Const = Const0;
5271 } else {
5272 return nullptr;
5273 }
5274 auto *VarDef =
5275 llvm::dyn_cast<InstArithmetic>(VMetadata->getSingleDefinition(Var));
5276 if (VarDef == nullptr)
5277 return nullptr;
5278
5279 SizeT ZeroesAvailable = 0;
5280 if (VarDef->getOp() == InstArithmetic::Shl) {
5281 if (auto *ConstInt =
5282 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
5283 ZeroesAvailable = ConstInt->getValue();
5284 }
5285 } else if (VarDef->getOp() == InstArithmetic::Mul) {
5286 SizeT PowerOfTwo = 0;
5287 if (auto *MultConst =
5288 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(0))) {
5289 if (llvm::isPowerOf2_32(MultConst->getValue())) {
5290 PowerOfTwo += MultConst->getValue();
5291 }
5292 }
5293 if (auto *MultConst =
5294 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
5295 if (llvm::isPowerOf2_32(MultConst->getValue())) {
5296 PowerOfTwo += MultConst->getValue();
5297 }
5298 }
5299 ZeroesAvailable = llvm::Log2_32(PowerOfTwo) + 1;
5300 }
5301 SizeT ZeroesNeeded = llvm::Log2_32(Const->getValue()) + 1;
5302 if (ZeroesNeeded == 0 || ZeroesNeeded > ZeroesAvailable)
5303 return nullptr;
5304 IsAdd = true; // treat it as an add if the above conditions hold
5305 } else {
5306 IsAdd = ArithInst->getOp() == InstArithmetic::Add;
5307 }
5308
5309 Variable *NewIndexOrBase = nullptr;
5310 int32_t NewOffset = 0;
5311 ConstantRelocatable *NewRelocatable = *Relocatable;
5312 if (Var0 && Var1)
5313 // TODO(sehr): merge base/index splitting into here.
5314 return nullptr;
5315 if (!IsAdd && Var1)
5316 return nullptr;
5317 if (Var0)
5318 NewIndexOrBase = Var0;
5319 else if (Var1)
5320 NewIndexOrBase = Var1;
5321 // Don't know how to add/subtract two relocatables.
5322 if ((*Relocatable && (Reloc0 || Reloc1)) || (Reloc0 && Reloc1))
5323 return nullptr;
5324 // Don't know how to subtract a relocatable.
5325 if (!IsAdd && Reloc1)
5326 return nullptr;
5327 // Incorporate ConstantRelocatables.
5328 if (Reloc0)
5329 NewRelocatable = Reloc0;
5330 else if (Reloc1)
5331 NewRelocatable = Reloc1;
5332 // Compute the updated constant offset.
5333 if (Const0) {
5334 const int32_t MoreOffset =
5335 IsAdd ? Const0->getValue() : -Const0->getValue();
5336 if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
5337 return nullptr;
5338 NewOffset += MoreOffset;
5339 }
5340 if (Const1) {
5341 const int32_t MoreOffset =
5342 IsAdd ? Const1->getValue() : -Const1->getValue();
5343 if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
5344 return nullptr;
5345 NewOffset += MoreOffset;
5346 }
5347 if (Utils::WouldOverflowAdd(*Offset, NewOffset << Shift))
5348 return nullptr;
5349 *IndexOrBase = NewIndexOrBase;
5350 *Offset += (NewOffset << Shift);
5351 // Shift is always zero if this is called with the base
5352 *Relocatable = NewRelocatable;
5353 return Definition;
5354 }
5355 return nullptr;
5356 }
5357
5358 X86OperandMem *TargetX8632::computeAddressOpt(const Inst *Instr, Type MemType,
5359 Operand *Addr) {
5360 Func->resetCurrentNode();
5361 if (Func->isVerbose(IceV_AddrOpt)) {
5362 OstreamLocker L(Func->getContext());
5363 Ostream &Str = Func->getContext()->getStrDump();
5364 Str << "\nStarting computeAddressOpt for instruction:\n ";
5365 Instr->dumpDecorated(Func);
5366 }
5367
5368 OptAddr NewAddr;
5369 NewAddr.Base = llvm::dyn_cast<Variable>(Addr);
5370 if (NewAddr.Base == nullptr)
5371 return nullptr;
5372
5373 // If the Base has more than one use or is live across multiple blocks, then
5374 // don't go further. Alternatively (?), never consider a transformation that
5375 // would change a variable that is currently *not* live across basic block
5376 // boundaries into one that *is*.
5377 if (!getFlags().getLoopInvariantCodeMotion()) {
5378 // Need multi block address opt when licm is enabled.
5379 // Might make sense to restrict to current node and loop header.
5380 if (Func->getVMetadata()->isMultiBlock(
5381 NewAddr.Base) /* || Base->getUseCount() > 1*/)
5382 return nullptr;
5383 }
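// As a rough, illustrative example (with hypothetical variable names), for
//   t1 = ptr + 16
//   t2 = i * 4
//   t3 = t1 + t2
//   ... = load t3
// the matching loop below can fold the address into a single operand of the
// form [ptr + i*4 + 16]: matchCombinedBaseIndex splits t3 into Base=t1 and
// Index=t2, matchShiftedIndex turns t2 into Index=i with Shift=2, and
// matchOffsetIndexOrBase folds the +16 from t1 into Offset.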
5384 AddressOptimizer AddrOpt(Func);
5385 const bool MockBounds = getFlags().getMockBoundsCheck();
5386 const Inst *Reason = nullptr;
5387 bool AddressWasOptimized = false;
5388 // The following unnamed struct identifies the address mode formation steps
5389 // that could potentially create an invalid memory operand (i.e., no free
5390 // slots for RebasePtr). We add all those flags to this struct so that we
5391 // can use memset() to reset all members to false.
5392 struct {
5393 bool AssignBase = false;
5394 bool AssignIndex = false;
5395 bool OffsetFromBase = false;
5396 bool OffsetFromIndex = false;
5397 bool CombinedBaseIndex = false;
5398 } Skip;
5399 // NewAddrCheckpoint is used to roll back the address being formed in case
5400 // an invalid address results.
5401 OptAddr NewAddrCheckpoint;
5402 Reason = Instr;
5403 do {
5404 if (Reason) {
5405 AddrOpt.dumpAddressOpt(NewAddr.Relocatable, NewAddr.Offset, NewAddr.Base,
5406 NewAddr.Index, NewAddr.Shift, Reason);
5407 AddressWasOptimized = true;
5408 Reason = nullptr;
5409 memset(reinterpret_cast<void *>(&Skip), 0, sizeof(Skip));
5410 }
5411
5412 NewAddrCheckpoint = NewAddr;
5413
5414 // Update Base and Index to follow through assignments to definitions.
5415 if (!Skip.AssignBase &&
5416 (Reason = AddrOpt.matchAssign(&NewAddr.Base, &NewAddr.Relocatable,
5417 &NewAddr.Offset))) {
5418 // Assignments of Base from a Relocatable or ConstantInt32 can result
5419 // in Base becoming nullptr. To avoid code duplication in this loop we
5420 // prefer that Base be non-nullptr if possible.
5421 if ((NewAddr.Base == nullptr) && (NewAddr.Index != nullptr) &&
5422 NewAddr.Shift == 0) {
5423 std::swap(NewAddr.Base, NewAddr.Index);
5424 }
5425 continue;
5426 }
5427 if (!Skip.AssignIndex &&
5428 (Reason = AddrOpt.matchAssign(&NewAddr.Index, &NewAddr.Relocatable,
5429 &NewAddr.Offset))) {
5430 continue;
5431 }
5432
5433 if (!MockBounds) {
5434 // Transition from:
5435 // <Relocatable + Offset>(Base) to
5436 // <Relocatable + Offset>(Base, Index)
5437 if (!Skip.CombinedBaseIndex &&
5438 (Reason = AddrOpt.matchCombinedBaseIndex(
5439 &NewAddr.Base, &NewAddr.Index, &NewAddr.Shift))) {
5440 continue;
5441 }
5442
5443 // Recognize multiply/shift and update Shift amount.
5444 // Index becomes Index=Var<<Const && Const+Shift<=3 ==>
5445 // Index=Var, Shift+=Const
5446 // Index becomes Index=Const*Var && log2(Const)+Shift<=3 ==>
5447 // Index=Var, Shift+=log2(Const)
5448 if ((Reason =
5449 AddrOpt.matchShiftedIndex(&NewAddr.Index, &NewAddr.Shift))) {
5450 continue;
5451 }
5452
5453 // If Shift is zero, the choice of Base and Index was purely arbitrary.
5454 // Recognize multiply/shift and set Shift amount.
5455 // Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
5456 // swap(Index,Base)
5457 // Similar for Base=Const*Var and Base=Var<<Const
5458 if (NewAddr.Shift == 0 &&
5459 (Reason = AddrOpt.matchShiftedIndex(&NewAddr.Base, &NewAddr.Shift))) {
5460 std::swap(NewAddr.Base, NewAddr.Index);
5461 continue;
5462 }
5463 }
5464
5465 // Update Offset to reflect additions/subtractions with constants and
5466 // relocatables.
5467 // TODO: consider overflow issues with respect to Offset.
5468 if (!Skip.OffsetFromBase && (Reason = AddrOpt.matchOffsetIndexOrBase(
5469 &NewAddr.Base, /*Shift =*/0,
5470 &NewAddr.Relocatable, &NewAddr.Offset))) {
5471 continue;
5472 }
5473 if (!Skip.OffsetFromIndex && (Reason = AddrOpt.matchOffsetIndexOrBase(
5474 &NewAddr.Index, NewAddr.Shift,
5475 &NewAddr.Relocatable, &NewAddr.Offset))) {
5476 continue;
5477 }
5478
5479 break;
5480 } while (Reason);
5481
5482 if (!AddressWasOptimized) {
5483 return nullptr;
5484 }
5485
5486 // Undo any addition of RebasePtr. It will be added back when the mem
5487 // operand is sandboxed.
5488 if (NewAddr.Base == RebasePtr) {
5489 NewAddr.Base = nullptr;
5490 }
5491
5492 if (NewAddr.Index == RebasePtr) {
5493 NewAddr.Index = nullptr;
5494 NewAddr.Shift = 0;
5495 }
5496
5497 Constant *OffsetOp = nullptr;
5498 if (NewAddr.Relocatable == nullptr) {
5499 OffsetOp = Ctx->getConstantInt32(NewAddr.Offset);
5500 } else {
5501 OffsetOp =
5502 Ctx->getConstantSym(NewAddr.Relocatable->getOffset() + NewAddr.Offset,
5503 NewAddr.Relocatable->getName());
5504 }
5505 // Vanilla ICE load instructions should not use the segment registers, and
5506 // computeAddressOpt only works at the level of Variables and Constants, not
5507 // other X86OperandMem, so there should be no mention of segment
5508 // registers there either.
5509 static constexpr auto SegmentReg =
5510 X86OperandMem::SegmentRegisters::DefaultSegment;
5511
5512 return X86OperandMem::create(Func, MemType, NewAddr.Base, OffsetOp,
5513 NewAddr.Index, NewAddr.Shift, SegmentReg);
5514 }
5515
5516 /// Add a mock bounds check on the memory address before using it as a load or
5517 /// store operand. The basic idea is that given a memory operand [reg], we
5518 /// would first add bounds-check code something like:
5519 ///
5520 /// cmp reg, <lb>
5521 /// jl out_of_line_error
5522 /// cmp reg, <ub>
5523 /// jg out_of_line_error
5524 ///
5525 /// In reality, the specific code will depend on how <lb> and <ub> are
5526 /// represented, e.g. an immediate, a global, or a function argument.
5527 ///
5528 /// As such, we need to enforce that the memory operand does not have the form
5529 /// [reg1+reg2], because then there is no simple cmp instruction that would
5530 /// suffice. However, we consider [reg+offset] to be OK because the offset is
5531 /// usually small, and so <ub> could have a safety buffer built in and then we
5532 /// could instead branch to a custom out_of_line_error that does the precise
5533 /// check and jumps back if it turns out OK.
5534 ///
5535 /// For the purpose of mocking the bounds check, we'll do something like this:
5536 ///
5537 /// cmp reg, 0
5538 /// je label
5539 /// cmp reg, 1
5540 /// je label
5541 /// label:
5542 ///
5543 /// Also note that we don't need to add a bounds check to a dereference of a
5544 /// simple global variable address.
5545
5546 void TargetX8632::doMockBoundsCheck(Operand *Opnd) {
5547 if (!getFlags().getMockBoundsCheck())
5548 return;
5549 if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd)) {
5550 if (Mem->getIndex()) {
5551 llvm::report_fatal_error("doMockBoundsCheck: Opnd contains index reg");
5552 }
5553 Opnd = Mem->getBase();
5554 }
5555 // At this point Opnd could be nullptr, or Variable, or Constant, or perhaps
5556 // something else. We only care if it is Variable.
5557 auto *Var = llvm::dyn_cast_or_null<Variable>(Opnd);
5558 if (Var == nullptr)
5559 return;
5560 // We use lowerStore() to copy out-args onto the stack. This creates a
5561 // memory operand with the stack pointer as the base register. Don't do
5562 // bounds checks on that.
5563 if (Var->getRegNum() == getStackReg())
5564 return;
5565
5566 auto *Label = InstX86Label::create(Func, this);
5567 _cmp(Opnd, Ctx->getConstantZero(IceType_i32));
5568 _br(CondX86::Br_e, Label);
5569 _cmp(Opnd, Ctx->getConstantInt32(1));
5570 _br(CondX86::Br_e, Label);
5571 Context.insert(Label);
5572 }
5573
5574 void TargetX8632::lowerLoad(const InstLoad *Load) {
5575 // A Load instruction can be treated the same as an Assign instruction,
5576 // after the source operand is transformed into an X86OperandMem operand.
5577 // Note that the address mode optimization already creates an X86OperandMem
5578 // operand, so it doesn't need another level of transformation.
5579 Variable *DestLoad = Load->getDest();
5580 Type Ty = DestLoad->getType();
5581 Operand *Src0 = formMemoryOperand(Load->getLoadAddress(), Ty);
5582 doMockBoundsCheck(Src0);
5583 auto *Assign = InstAssign::create(Func, DestLoad, Src0);
5584 lowerAssign(Assign);
5585 }
5586
5587 void TargetX8632::doAddressOptOther() {
5588 // Inverts some Icmp instructions, which helps doAddressOptLoad later.
5589 // TODO(manasijm): Refactor to unify the conditions for Var0 and Var1
5590 Inst *Instr = iteratorToInst(Context.getCur());
5591 auto *VMetadata = Func->getVMetadata();
5592 if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Instr)) {
5593 if (llvm::isa<Constant>(Icmp->getSrc(0)) ||
5594 llvm::isa<Constant>(Icmp->getSrc(1)))
5595 return;
5596 auto *Var0 = llvm::dyn_cast<Variable>(Icmp->getSrc(0));
5597 if (Var0 == nullptr)
5598 return;
5599 if (!VMetadata->isTracked(Var0))
5600 return;
5601 auto *Op0Def = VMetadata->getFirstDefinitionSingleBlock(Var0);
5602 if (Op0Def == nullptr || !llvm::isa<InstLoad>(Op0Def))
5603 return;
5604 if (VMetadata->getLocalUseNode(Var0) != Context.getNode())
5605 return;
5606
5607 auto *Var1 = llvm::dyn_cast<Variable>(Icmp->getSrc(1));
5608 if (Var1 != nullptr && VMetadata->isTracked(Var1)) {
5609 auto *Op1Def = VMetadata->getFirstDefinitionSingleBlock(Var1);
5610 if (Op1Def != nullptr && !VMetadata->isMultiBlock(Var1) &&
5611 llvm::isa<InstLoad>(Op1Def)) {
5612 return; // Both are loads
5613 }
5614 }
5615 Icmp->reverseConditionAndOperands();
5616 }
5617 }
5618
5619 void TargetX8632::doAddressOptLoad() {
5620 Inst *Instr = iteratorToInst(Context.getCur());
5621 Operand *Addr = Instr->getSrc(0);
5622 Variable *Dest = Instr->getDest();
5623 if (auto *OptAddr = computeAddressOpt(Instr, Dest->getType(), Addr)) {
5624 Instr->setDeleted();
5625 Context.insert<InstLoad>(Dest, OptAddr);
5626 }
5627 }
5628
5629 void TargetX8632::doAddressOptLoadSubVector() {
5630 auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
5631 Operand *Addr = Intrinsic->getArg(0);
5632 Variable *Dest = Intrinsic->getDest();
5633 if (auto *OptAddr = computeAddressOpt(Intrinsic, Dest->getType(), Addr)) {
5634 Intrinsic->setDeleted();
5635 const Ice::Intrinsics::IntrinsicInfo Info = {
5636 Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F,
5637 Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
5638 auto *NewLoad = Context.insert<InstIntrinsic>(2, Dest, Info);
5639 NewLoad->addArg(OptAddr);
5640 NewLoad->addArg(Intrinsic->getArg(1));
5641 }
5642 }
5643
5644 void TargetX8632::lowerPhi(const InstPhi * /*Instr*/) {
5645 Func->setError("Phi found in regular instruction list");
5646 }
5647
5648 void TargetX8632::lowerRet(const InstRet *Instr) {
5649 Variable *Reg = nullptr;
5650 if (Instr->hasRetValue()) {
5651 Operand *RetValue = legalize(Instr->getRetValue());
5652 const Type ReturnType = RetValue->getType();
5653 assert(isVectorType(ReturnType) || isScalarFloatingType(ReturnType) ||
5654 (ReturnType == IceType_i32) || (ReturnType == IceType_i64));
5655 Reg = moveReturnValueToRegister(RetValue, ReturnType);
5656 }
5657 // Add a ret instruction even if sandboxing is enabled, because addEpilog
5658 // explicitly looks for a ret instruction as a marker for where to insert
5659 // the frame removal instructions.
5660 _ret(Reg);
5661 // Add a fake use of esp to make sure esp stays alive for the entire
5662 // function. Otherwise post-call esp adjustments get dead-code eliminated.
5663 keepEspLiveAtExit();
5664 }
5665
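// Builds the 8-bit pshufd/shufps immediate from four 2-bit lane selectors.
// For example, makePshufdMask(3, 2, 1, 0) yields 0x1B, which selects the
// four 32-bit lanes in reverse order.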
5666 inline uint32_t makePshufdMask(SizeT Index0, SizeT Index1, SizeT Index2,
5667 SizeT Index3) {
5668 const SizeT Mask = (Index0 & 0x3) | ((Index1 & 0x3) << 2) |
5669 ((Index2 & 0x3) << 4) | ((Index3 & 0x3) << 6);
5670 assert(Mask < 256);
5671 return Mask;
5672 }
5673
5674 Variable *TargetX8632::lowerShuffleVector_AllFromSameSrc(
5675 Operand *Src, SizeT Index0, SizeT Index1, SizeT Index2, SizeT Index3) {
5676 constexpr SizeT SrcBit = 1 << 2;
5677 assert((Index0 & SrcBit) == (Index1 & SrcBit));
5678 assert((Index0 & SrcBit) == (Index2 & SrcBit));
5679 assert((Index0 & SrcBit) == (Index3 & SrcBit));
5680 (void)SrcBit;
5681
5682 const Type SrcTy = Src->getType();
5683 auto *T = makeReg(SrcTy);
5684 auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
5685 auto *Mask =
5686 Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
5687 _pshufd(T, SrcRM, Mask);
5688 return T;
5689 }
5690
5691 Variable *
5692 TargetX8632::lowerShuffleVector_TwoFromSameSrc(Operand *Src0, SizeT Index0,
5693 SizeT Index1, Operand *Src1,
5694 SizeT Index2, SizeT Index3) {
5695 constexpr SizeT SrcBit = 1 << 2;
5696 assert((Index0 & SrcBit) == (Index1 & SrcBit) || (Index1 == IGNORE_INDEX));
5697 assert((Index2 & SrcBit) == (Index3 & SrcBit) || (Index3 == IGNORE_INDEX));
5698 (void)SrcBit;
5699
5700 const Type SrcTy = Src0->getType();
5701 assert(Src1->getType() == SrcTy);
5702 auto *T = makeReg(SrcTy);
5703 auto *Src0R = legalizeToReg(Src0);
5704 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5705 auto *Mask =
5706 Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
5707 _movp(T, Src0R);
5708 _shufps(T, Src1RM, Mask);
5709 return T;
5710 }
5711
5712 Variable *TargetX8632::lowerShuffleVector_UnifyFromDifferentSrcs(Operand *Src0,
5713 SizeT Index0,
5714 Operand *Src1,
5715 SizeT Index1) {
5716 return lowerShuffleVector_TwoFromSameSrc(Src0, Index0, IGNORE_INDEX, Src1,
5717 Index1, IGNORE_INDEX);
5718 }
5719
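// Builds a 4-bit mask whose bit N is set when IndexN selects from the second
// source operand (i.e. bit 2 of the index is set). For example,
// makeSrcSwitchMask(0, 5, 1, 6) yields 0b1010, matching CASE_SRCS_IN(0, 1, 0, 1)
// below.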
5720 inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2,
5721 SizeT Index3) {
5722 constexpr SizeT SrcBit = 1 << 2;
5723 const SizeT Index0Bits = ((Index0 & SrcBit) == 0) ? 0 : (1 << 0);
5724 const SizeT Index1Bits = ((Index1 & SrcBit) == 0) ? 0 : (1 << 1);
5725 const SizeT Index2Bits = ((Index2 & SrcBit) == 0) ? 0 : (1 << 2);
5726 const SizeT Index3Bits = ((Index3 & SrcBit) == 0) ? 0 : (1 << 3);
5727 return Index0Bits | Index1Bits | Index2Bits | Index3Bits;
5728 }
5729
5730 GlobalString TargetX8632::lowerShuffleVector_NewMaskName() {
5731 GlobalString FuncName = Func->getFunctionName();
5732 const SizeT Id = PshufbMaskCount++;
5733 if (!BuildDefs::dump() || !FuncName.hasStdString()) {
5734 return GlobalString::createWithString(
5735 Ctx,
5736 "$PS" + std::to_string(FuncName.getID()) + "_" + std::to_string(Id));
5737 }
5738 return GlobalString::createWithString(
5739 Ctx, "Pshufb$" + Func->getFunctionName() + "$" + std::to_string(Id));
5740 }
5741
5742 ConstantRelocatable *TargetX8632::lowerShuffleVector_CreatePshufbMask(
5743 int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
5744 int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
5745 int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
5746 int8_t Idx15) {
5747 static constexpr uint8_t NumElements = 16;
5748 const char Initializer[NumElements] = {
5749 Idx0, Idx1, Idx2, Idx3, Idx4, Idx5, Idx6, Idx7,
5750 Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15,
5751 };
5752
5753 static constexpr Type V4VectorType = IceType_v4i32;
5754 const uint32_t MaskAlignment = typeWidthInBytesOnStack(V4VectorType);
5755 auto *Mask = VariableDeclaration::create(Func->getGlobalPool());
5756 GlobalString MaskName = lowerShuffleVector_NewMaskName();
5757 Mask->setIsConstant(true);
5758 Mask->addInitializer(VariableDeclaration::DataInitializer::create(
5759 Func->getGlobalPool(), Initializer, NumElements));
5760 Mask->setName(MaskName);
5761 // Mask needs to be 16-byte aligned, or pshufb will seg fault.
5762 Mask->setAlignment(MaskAlignment);
5763 Func->addGlobal(Mask);
5764
5765 constexpr RelocOffsetT Offset = 0;
5766 return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName));
5767 }
5768
5769 void TargetX8632::lowerShuffleVector_UsingPshufb(
5770 Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1,
5771 int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6,
5772 int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11,
5773 int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) {
5774 const Type DestTy = Dest->getType();
5775 static constexpr bool NotRebased = false;
5776 static constexpr Variable *NoBase = nullptr;
5777 // We use void for the memory operand instead of DestTy because using the
5778 // latter causes a validation failure: the X86 Inst layer complains that
5779 // vector mem operands could be under-aligned. Using void avoids the
5780 // validation error. Note that the mask global declaration is aligned, so it
5781 // can be used as an XMM mem operand.
5782 static constexpr Type MaskType = IceType_void;
5783 #define IDX_IN_SRC(N, S) \
5784 ((((N) & (1 << 4)) == (S << 4)) ? ((N)&0xf) : CLEAR_ALL_BITS)
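// IDX_IN_SRC(N, S) keeps index N (reduced to 0..15) when it selects from
// source S (bit 4 of the index distinguishes the two 16-byte sources);
// otherwise it yields CLEAR_ALL_BITS which, assuming it sets the mask byte's
// high bit, makes pshufb zero that lane so the two partial results can
// simply be OR'd together below.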
5785 auto *Mask0M = X86OperandMem::create(
5786 Func, MaskType, NoBase,
5787 lowerShuffleVector_CreatePshufbMask(
5788 IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0),
5789 IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0),
5790 IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0),
5791 IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0),
5792 IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0),
5793 IDX_IN_SRC(Idx15, 0)),
5794 NotRebased);
5795
5796 auto *T0 = makeReg(DestTy);
5797 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5798 _movp(T0, Src0RM);
5799
5800 _pshufb(T0, Mask0M);
5801
5802 if (Idx0 >= 16 || Idx1 >= 16 || Idx2 >= 16 || Idx3 >= 16 || Idx4 >= 16 ||
5803 Idx5 >= 16 || Idx6 >= 16 || Idx7 >= 16 || Idx8 >= 16 || Idx9 >= 16 ||
5804 Idx10 >= 16 || Idx11 >= 16 || Idx12 >= 16 || Idx13 >= 16 || Idx14 >= 16 ||
5805 Idx15 >= 16) {
5806 auto *Mask1M = X86OperandMem::create(
5807 Func, MaskType, NoBase,
5808 lowerShuffleVector_CreatePshufbMask(
5809 IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1),
5810 IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1),
5811 IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1),
5812 IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1),
5813 IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1),
5814 IDX_IN_SRC(Idx15, 1)),
5815 NotRebased);
5816 #undef IDX_IN_SRC
5817 auto *T1 = makeReg(DestTy);
5818 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5819 _movp(T1, Src1RM);
5820 _pshufb(T1, Mask1M);
5821 _por(T0, T1);
5822 }
5823
5824 _movp(Dest, T0);
5825 }
5826
5827 void TargetX8632::lowerShuffleVector(const InstShuffleVector *Instr) {
5828 auto *Dest = Instr->getDest();
5829 const Type DestTy = Dest->getType();
5830 auto *Src0 = Instr->getSrc(0);
5831 auto *Src1 = Instr->getSrc(1);
5832 const SizeT NumElements = typeNumElements(DestTy);
5833
5834 auto *T = makeReg(DestTy);
5835
5836 switch (DestTy) {
5837 default:
5838 llvm::report_fatal_error("Unexpected vector type.");
5839 case IceType_v16i1:
5840 case IceType_v16i8: {
5841 static constexpr SizeT ExpectedNumElements = 16;
5842 assert(ExpectedNumElements == Instr->getNumIndexes());
5843 (void)ExpectedNumElements;
5844
5845 if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
5846 auto *T = makeReg(DestTy);
5847 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5848 _movp(T, Src0RM);
5849 _punpckl(T, Src0RM);
5850 _movp(Dest, T);
5851 return;
5852 }
5853
5854 if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
5855 23)) {
5856 auto *T = makeReg(DestTy);
5857 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5858 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5859 _movp(T, Src0RM);
5860 _punpckl(T, Src1RM);
5861 _movp(Dest, T);
5862 return;
5863 }
5864
5865 if (Instr->indexesAre(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
5866 15, 15)) {
5867 auto *T = makeReg(DestTy);
5868 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5869 _movp(T, Src0RM);
5870 _punpckh(T, Src0RM);
5871 _movp(Dest, T);
5872 return;
5873 }
5874
5875 if (Instr->indexesAre(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30,
5876 15, 31)) {
5877 auto *T = makeReg(DestTy);
5878 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5879 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5880 _movp(T, Src0RM);
5881 _punpckh(T, Src1RM);
5882 _movp(Dest, T);
5883 return;
5884 }
5885
5886 if (InstructionSet < SSE4_1) {
5887 // TODO(jpp): figure out how to lower with sse2.
5888 break;
5889 }
5890
5891 const SizeT Index0 = Instr->getIndexValue(0);
5892 const SizeT Index1 = Instr->getIndexValue(1);
5893 const SizeT Index2 = Instr->getIndexValue(2);
5894 const SizeT Index3 = Instr->getIndexValue(3);
5895 const SizeT Index4 = Instr->getIndexValue(4);
5896 const SizeT Index5 = Instr->getIndexValue(5);
5897 const SizeT Index6 = Instr->getIndexValue(6);
5898 const SizeT Index7 = Instr->getIndexValue(7);
5899 const SizeT Index8 = Instr->getIndexValue(8);
5900 const SizeT Index9 = Instr->getIndexValue(9);
5901 const SizeT Index10 = Instr->getIndexValue(10);
5902 const SizeT Index11 = Instr->getIndexValue(11);
5903 const SizeT Index12 = Instr->getIndexValue(12);
5904 const SizeT Index13 = Instr->getIndexValue(13);
5905 const SizeT Index14 = Instr->getIndexValue(14);
5906 const SizeT Index15 = Instr->getIndexValue(15);
5907
5908 lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
5909 Index3, Index4, Index5, Index6, Index7,
5910 Index8, Index9, Index10, Index11, Index12,
5911 Index13, Index14, Index15);
5912 return;
5913 }
5914 case IceType_v8i1:
5915 case IceType_v8i16: {
5916 static constexpr SizeT ExpectedNumElements = 8;
5917 assert(ExpectedNumElements == Instr->getNumIndexes());
5918 (void)ExpectedNumElements;
5919
5920 if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
5921 auto *T = makeReg(DestTy);
5922 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5923 _movp(T, Src0RM);
5924 _punpckl(T, Src0RM);
5925 _movp(Dest, T);
5926 return;
5927 }
5928
5929 if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
5930 auto *T = makeReg(DestTy);
5931 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5932 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5933 _movp(T, Src0RM);
5934 _punpckl(T, Src1RM);
5935 _movp(Dest, T);
5936 return;
5937 }
5938
5939 if (Instr->indexesAre(4, 4, 5, 5, 6, 6, 7, 7)) {
5940 auto *T = makeReg(DestTy);
5941 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5942 _movp(T, Src0RM);
5943 _punpckh(T, Src0RM);
5944 _movp(Dest, T);
5945 return;
5946 }
5947
5948 if (Instr->indexesAre(4, 12, 5, 13, 6, 14, 7, 15)) {
5949 auto *T = makeReg(DestTy);
5950 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5951 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5952 _movp(T, Src0RM);
5953 _punpckh(T, Src1RM);
5954 _movp(Dest, T);
5955 return;
5956 }
5957
5958 if (InstructionSet < SSE4_1) {
5959 // TODO(jpp): figure out how to lower with sse2.
5960 break;
5961 }
5962
5963 const SizeT Index0 = Instr->getIndexValue(0);
5964 const SizeT Index1 = Instr->getIndexValue(1);
5965 const SizeT Index2 = Instr->getIndexValue(2);
5966 const SizeT Index3 = Instr->getIndexValue(3);
5967 const SizeT Index4 = Instr->getIndexValue(4);
5968 const SizeT Index5 = Instr->getIndexValue(5);
5969 const SizeT Index6 = Instr->getIndexValue(6);
5970 const SizeT Index7 = Instr->getIndexValue(7);
5971
5972 #define TO_BYTE_INDEX(I) ((I) << 1)
5973 lowerShuffleVector_UsingPshufb(
5974 Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
5975 TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2),
5976 TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3),
5977 TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4),
5978 TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5),
5979 TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6),
5980 TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7),
5981 TO_BYTE_INDEX(Index7) + 1);
5982 #undef TO_BYTE_INDEX
5983 return;
5984 }
5985 case IceType_v4i1:
5986 case IceType_v4i32:
5987 case IceType_v4f32: {
5988 static constexpr SizeT ExpectedNumElements = 4;
5989 assert(ExpectedNumElements == Instr->getNumIndexes());
5990 const SizeT Index0 = Instr->getIndexValue(0);
5991 const SizeT Index1 = Instr->getIndexValue(1);
5992 const SizeT Index2 = Instr->getIndexValue(2);
5993 const SizeT Index3 = Instr->getIndexValue(3);
5994 Variable *T = nullptr;
5995 switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
5996 #define CASE_SRCS_IN(S0, S1, S2, S3) \
5997 case (((S0) << 0) | ((S1) << 1) | ((S2) << 2) | ((S3) << 3))
5998 CASE_SRCS_IN(0, 0, 0, 0) : {
5999 T = lowerShuffleVector_AllFromSameSrc(Src0, Index0, Index1, Index2,
6000 Index3);
6001 }
6002 break;
6003 CASE_SRCS_IN(0, 0, 0, 1) : {
6004 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
6005 Src1, Index3);
6006 T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
6007 UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6008 }
6009 break;
6010 CASE_SRCS_IN(0, 0, 1, 0) : {
6011 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
6012 Src0, Index3);
6013 T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
6014 UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6015 }
6016 break;
6017 CASE_SRCS_IN(0, 0, 1, 1) : {
6018 T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Src1,
6019 Index2, Index3);
6020 }
6021 break;
6022 CASE_SRCS_IN(0, 1, 0, 0) : {
6023 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
6024 Src1, Index1);
6025 T = lowerShuffleVector_TwoFromSameSrc(
6026 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
6027 }
6028 break;
6029 CASE_SRCS_IN(0, 1, 0, 1) : {
6030 if (Index0 == 0 && (Index1 - ExpectedNumElements) == 0 && Index2 == 1 &&
6031 (Index3 - ExpectedNumElements) == 1) {
6032 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6033 auto *Src0R = legalizeToReg(Src0);
6034 T = makeReg(DestTy);
6035 _movp(T, Src0R);
6036 _punpckl(T, Src1RM);
6037 } else if (Index0 == Index2 && Index1 == Index3) {
6038 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6039 Src0, Index0, Src1, Index1);
6040 T = lowerShuffleVector_AllFromSameSrc(
6041 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
6042 UNIFIED_INDEX_1);
6043 } else {
6044 auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6045 Src0, Index0, Src1, Index1);
6046 auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6047 Src0, Index2, Src1, Index3);
6048 T = lowerShuffleVector_TwoFromSameSrc(
6049 Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6050 UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6051 }
6052 }
6053 break;
6054 CASE_SRCS_IN(0, 1, 1, 0) : {
6055 if (Index0 == Index3 && Index1 == Index2) {
6056 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6057 Src0, Index0, Src1, Index1);
6058 T = lowerShuffleVector_AllFromSameSrc(
6059 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
6060 UNIFIED_INDEX_0);
6061 } else {
6062 auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6063 Src0, Index0, Src1, Index1);
6064 auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6065 Src1, Index2, Src0, Index3);
6066 T = lowerShuffleVector_TwoFromSameSrc(
6067 Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6068 UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6069 }
6070 }
6071 break;
6072 CASE_SRCS_IN(0, 1, 1, 1) : {
6073 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
6074 Src1, Index1);
6075 T = lowerShuffleVector_TwoFromSameSrc(
6076 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
6077 }
6078 break;
6079 CASE_SRCS_IN(1, 0, 0, 0) : {
6080 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
6081 Src0, Index1);
6082 T = lowerShuffleVector_TwoFromSameSrc(
6083 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
6084 }
6085 break;
6086 CASE_SRCS_IN(1, 0, 0, 1) : {
6087 if (Index0 == Index3 && Index1 == Index2) {
6088 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6089 Src1, Index0, Src0, Index1);
6090 T = lowerShuffleVector_AllFromSameSrc(
6091 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
6092 UNIFIED_INDEX_0);
6093 } else {
6094 auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6095 Src1, Index0, Src0, Index1);
6096 auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6097 Src0, Index2, Src1, Index3);
6098 T = lowerShuffleVector_TwoFromSameSrc(
6099 Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6100 UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6101 }
6102 }
6103 break;
6104 CASE_SRCS_IN(1, 0, 1, 0) : {
6105 if ((Index0 - ExpectedNumElements) == 0 && Index1 == 0 &&
6106 (Index2 - ExpectedNumElements) == 1 && Index3 == 1) {
6107 auto *Src1RM = legalize(Src0, Legal_Reg | Legal_Mem);
6108 auto *Src0R = legalizeToReg(Src1);
6109 T = makeReg(DestTy);
6110 _movp(T, Src0R);
6111 _punpckl(T, Src1RM);
6112 } else if (Index0 == Index2 && Index1 == Index3) {
6113 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6114 Src1, Index0, Src0, Index1);
6115 T = lowerShuffleVector_AllFromSameSrc(
6116 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
6117 UNIFIED_INDEX_1);
6118 } else {
6119 auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6120 Src1, Index0, Src0, Index1);
6121 auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6122 Src1, Index2, Src0, Index3);
6123 T = lowerShuffleVector_TwoFromSameSrc(
6124 Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6125 UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6126 }
6127 }
6128 break;
6129 CASE_SRCS_IN(1, 0, 1, 1) : {
6130 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
6131 Src0, Index1);
6132 T = lowerShuffleVector_TwoFromSameSrc(
6133 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
6134 }
6135 break;
6136 CASE_SRCS_IN(1, 1, 0, 0) : {
6137 T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Src0,
6138 Index2, Index3);
6139 }
6140 break;
6141 CASE_SRCS_IN(1, 1, 0, 1) : {
6142 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
6143 Src1, Index3);
6144 T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
6145 UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6146 }
6147 break;
6148 CASE_SRCS_IN(1, 1, 1, 0) : {
6149 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
6150 Src0, Index3);
6151 T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
6152 UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6153 }
6154 break;
6155 CASE_SRCS_IN(1, 1, 1, 1) : {
6156 T = lowerShuffleVector_AllFromSameSrc(Src1, Index0, Index1, Index2,
6157 Index3);
6158 }
6159 break;
6160 #undef CASE_SRCS_IN
6161 }
6162
6163 assert(T != nullptr);
6164 assert(T->getType() == DestTy);
6165 _movp(Dest, T);
6166 return;
6167 } break;
6168 }
6169
6170 // Unoptimized shuffle. Perform a series of inserts and extracts.
6171 Context.insert<InstFakeDef>(T);
6172 const Type ElementType = typeElementType(DestTy);
6173 for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
6174 auto *Index = Instr->getIndex(I);
6175 const SizeT Elem = Index->getValue();
6176 auto *ExtElmt = makeReg(ElementType);
6177 if (Elem < NumElements) {
6178 lowerExtractElement(
6179 InstExtractElement::create(Func, ExtElmt, Src0, Index));
6180 } else {
6181 lowerExtractElement(InstExtractElement::create(
6182 Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements)));
6183 }
6184 auto *NewT = makeReg(DestTy);
6185 lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
6186 Ctx->getConstantInt32(I)));
6187 T = NewT;
6188 }
6189 _movp(Dest, T);
6190 }
6191
6192 void TargetX8632::lowerSelect(const InstSelect *Select) {
6193 Variable *Dest = Select->getDest();
6194
6195 Operand *Condition = Select->getCondition();
6196 // Handle folding opportunities.
6197 if (const Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
6198 assert(Producer->isDeleted());
6199 switch (BoolFolding::getProducerKind(Producer)) {
6200 default:
6201 break;
6202 case BoolFolding::PK_Icmp32:
6203 case BoolFolding::PK_Icmp64: {
6204 lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Select);
6205 return;
6206 }
6207 case BoolFolding::PK_Fcmp: {
6208 lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Select);
6209 return;
6210 }
6211 }
6212 }
6213
6214 if (isVectorType(Dest->getType())) {
6215 lowerSelectVector(Select);
6216 return;
6217 }
6218
6219 Operand *CmpResult = legalize(Condition, Legal_Reg | Legal_Mem);
6220 Operand *Zero = Ctx->getConstantZero(IceType_i32);
6221 _cmp(CmpResult, Zero);
6222 Operand *SrcT = Select->getTrueOperand();
6223 Operand *SrcF = Select->getFalseOperand();
6224 const BrCond Cond = CondX86::Br_ne;
6225 lowerSelectMove(Dest, Cond, SrcT, SrcF);
6226 }
6227
6228 void TargetX8632::lowerSelectMove(Variable *Dest, BrCond Cond, Operand *SrcT,
6229 Operand *SrcF) {
6230 Type DestTy = Dest->getType();
6231 if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
6232 // The cmov instruction doesn't allow 8-bit or FP operands, so we need
6233 // explicit control flow.
6234 // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
6235 auto *Label = InstX86Label::create(Func, this);
6236 SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
6237 _mov(Dest, SrcT);
6238 _br(Cond, Label);
6239 SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
6240 _redefined(_mov(Dest, SrcF));
6241 Context.insert(Label);
6242 return;
6243 }
6244 // mov t, SrcF; cmov_cond t, SrcT; mov dest, t
6245 // But if SrcT is immediate, we might be able to do better, as the cmov
6246 // instruction doesn't allow an immediate operand:
6247 // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
6248 if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
6249 std::swap(SrcT, SrcF);
6250 Cond = InstX86Base::getOppositeCondition(Cond);
6251 }
6252 if (DestTy == IceType_i64) {
6253 SrcT = legalizeUndef(SrcT);
6254 SrcF = legalizeUndef(SrcF);
6255 // Set the low portion.
6256 auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
6257 lowerSelectIntMove(DestLo, Cond, loOperand(SrcT), loOperand(SrcF));
6258 // Set the high portion.
6259 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
6260 lowerSelectIntMove(DestHi, Cond, hiOperand(SrcT), hiOperand(SrcF));
6261 return;
6262 }
6263
6264 assert(DestTy == IceType_i16 || DestTy == IceType_i32);
6265 lowerSelectIntMove(Dest, Cond, SrcT, SrcF);
6266 }
6267
6268 void TargetX8632::lowerSelectIntMove(Variable *Dest, BrCond Cond, Operand *SrcT,
6269 Operand *SrcF) {
6270 Variable *T = nullptr;
6271 SrcF = legalize(SrcF);
6272 _mov(T, SrcF);
6273 SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
6274 _cmov(T, SrcT, Cond);
6275 _mov(Dest, T);
6276 }
6277
6278 void TargetX8632::lowerMove(Variable *Dest, Operand *Src, bool IsRedefinition) {
6279 assert(Dest->getType() == Src->getType());
6280 assert(!Dest->isRematerializable());
6281 if (Dest->getType() == IceType_i64) {
6282 Src = legalize(Src);
6283 Operand *SrcLo = loOperand(Src);
6284 Operand *SrcHi = hiOperand(Src);
6285 auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
6286 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
6287 Variable *T_Lo = nullptr, *T_Hi = nullptr;
6288 _mov(T_Lo, SrcLo);
6289 _redefined(_mov(DestLo, T_Lo), IsRedefinition);
6290 _mov(T_Hi, SrcHi);
6291 _redefined(_mov(DestHi, T_Hi), IsRedefinition);
6292 } else {
6293 Operand *SrcLegal;
6294 if (Dest->hasReg()) {
6295 // If Dest already has a physical register, then only basic legalization
6296 // is needed, as the source operand can be a register, immediate, or
6297 // memory.
6298 SrcLegal = legalize(Src, Legal_Reg, Dest->getRegNum());
6299 } else {
6300 // If Dest could be a stack operand, then the legalized source must be a
6301 // physical register or a scalar integer immediate.
6302 SrcLegal = legalize(Src, Legal_Reg | Legal_Imm);
6303 }
6304 if (isVectorType(Dest->getType())) {
6305 _redefined(_movp(Dest, SrcLegal), IsRedefinition);
6306 } else {
6307 _redefined(_mov(Dest, SrcLegal), IsRedefinition);
6308 }
6309 }
6310 }
6311
6312 bool TargetX8632::lowerOptimizeFcmpSelect(const InstFcmp *Fcmp,
6313 const InstSelect *Select) {
6314 Operand *CmpSrc0 = Fcmp->getSrc(0);
6315 Operand *CmpSrc1 = Fcmp->getSrc(1);
6316 Operand *SelectSrcT = Select->getTrueOperand();
6317 Operand *SelectSrcF = Select->getFalseOperand();
6318 Variable *SelectDest = Select->getDest();
6319
6320 // TODO(capn): also handle swapped compare/select operand order.
6321 if (CmpSrc0 != SelectSrcT || CmpSrc1 != SelectSrcF)
6322 return false;
6323
6324 // TODO(sehr, stichnot): fcmp/select patterns (e.g., minsd/maxss) go here.
6325 InstFcmp::FCond Condition = Fcmp->getCondition();
6326 switch (Condition) {
6327 default:
6328 return false;
6329 case InstFcmp::True:
6330 break;
6331 case InstFcmp::False:
6332 break;
6333 case InstFcmp::Ogt: {
6334 Variable *T = makeReg(SelectDest->getType());
6335 if (isScalarFloatingType(SelectSrcT->getType())) {
6336 _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6337 _maxss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6338 _mov(SelectDest, T);
6339 } else {
6340 _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6341 _maxps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6342 _movp(SelectDest, T);
6343 }
6344 return true;
6345 } break;
6346 case InstFcmp::Olt: {
6347 Variable *T = makeReg(SelectSrcT->getType());
6348 if (isScalarFloatingType(SelectSrcT->getType())) {
6349 _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6350 _minss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6351 _mov(SelectDest, T);
6352 } else {
6353 _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6354 _minps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6355 _movp(SelectDest, T);
6356 }
6357 return true;
6358 } break;
6359 }
6360 return false;
6361 }
6362
6363 void TargetX8632::lowerIcmp(const InstIcmp *Icmp) {
6364 Variable *Dest = Icmp->getDest();
6365 if (isVectorType(Dest->getType())) {
6366 lowerIcmpVector(Icmp);
6367 } else {
6368 constexpr Inst *Consumer = nullptr;
6369 lowerIcmpAndConsumer(Icmp, Consumer);
6370 }
6371 }
6372
6373 void TargetX8632::lowerSelectVector(const InstSelect *Instr) {
6374 Variable *Dest = Instr->getDest();
6375 Type DestTy = Dest->getType();
6376 Operand *SrcT = Instr->getTrueOperand();
6377 Operand *SrcF = Instr->getFalseOperand();
6378 Operand *Condition = Instr->getCondition();
6379
6380 if (!isVectorType(DestTy))
6381 llvm::report_fatal_error("Expected a vector select");
6382
6383 Type SrcTy = SrcT->getType();
6384 Variable *T = makeReg(SrcTy);
6385 Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
6386 Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
6387
6388 if (InstructionSet >= SSE4_1) {
6389 // TODO(wala): If the condition operand is a constant, use blendps or
6390 // pblendw.
6391 //
6392 // Use blendvps or pblendvb to implement select.
6393 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
6394 SrcTy == IceType_v4f32) {
6395 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
6396 Variable *xmm0 = makeReg(IceType_v4i32, RegX8632::Reg_xmm0);
6397 _movp(xmm0, ConditionRM);
6398 _psll(xmm0, Ctx->getConstantInt8(31));
6399 _movp(T, SrcFRM);
6400 _blendvps(T, SrcTRM, xmm0);
6401 _movp(Dest, T);
6402 } else {
6403 assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
6404 Type SignExtTy =
6405 Condition->getType() == IceType_v8i1 ? IceType_v8i16 : IceType_v16i8;
6406 Variable *xmm0 = makeReg(SignExtTy, RegX8632::Reg_xmm0);
6407 lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
6408 _movp(T, SrcFRM);
6409 _pblendvb(T, SrcTRM, xmm0);
6410 _movp(Dest, T);
6411 }
6412 return;
6413 }
6414 // Lower select without SSE4.1:
6415 // a=d?b:c ==>
6416 // if elementtype(d) != i1:
6417 // d=sext(d);
6418 // a=(b&d)|(c&~d);
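// Illustrative sketch of the emitted sequence for a v4i32 select with an
// already sign-extended condition d (register names are placeholders):
//   movp  t, d
//   movp  t2, t
//   pand  t, b       ; b & d
//   pandn t2, c      ; c & ~d
//   por   t, t2
//   movp  a, t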
6419 Variable *T2 = makeReg(SrcTy);
6420 // Sign extend the condition operand if applicable.
6421 if (SrcTy == IceType_v4f32) {
6422 // The sext operation takes only integer arguments.
6423 Variable *T3 = Func->makeVariable(IceType_v4i32);
6424 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
6425 _movp(T, T3);
6426 } else if (typeElementType(SrcTy) != IceType_i1) {
6427 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
6428 } else {
6429 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
6430 _movp(T, ConditionRM);
6431 }
6432 _movp(T2, T);
6433 _pand(T, SrcTRM);
6434 _pandn(T2, SrcFRM);
6435 _por(T, T2);
6436 _movp(Dest, T);
6437
6438 return;
6439 }
6440
6441 void TargetX8632::lowerStore(const InstStore *Instr) {
6442 Operand *Value = Instr->getData();
6443 Operand *Addr = Instr->getStoreAddress();
6444 X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
6445 doMockBoundsCheck(NewAddr);
6446 Type Ty = NewAddr->getType();
6447
6448 if (Ty == IceType_i64) {
6449 Value = legalizeUndef(Value);
6450 Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
6451 _store(ValueHi, llvm::cast<X86OperandMem>(hiOperand(NewAddr)));
6452 Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
6453 _store(ValueLo, llvm::cast<X86OperandMem>(loOperand(NewAddr)));
6454 } else if (isVectorType(Ty)) {
6455 _storep(legalizeToReg(Value), NewAddr);
6456 } else {
6457 Value = legalize(Value, Legal_Reg | Legal_Imm);
6458 _store(Value, NewAddr);
6459 }
6460 }
6461
6462 void TargetX8632::doAddressOptStore() {
6463 auto *Instr = llvm::cast<InstStore>(Context.getCur());
6464 Operand *Addr = Instr->getStoreAddress();
6465 Operand *Data = Instr->getData();
6466 if (auto *OptAddr = computeAddressOpt(Instr, Data->getType(), Addr)) {
6467 Instr->setDeleted();
6468 auto *NewStore = Context.insert<InstStore>(Data, OptAddr);
6469 if (Instr->getDest())
6470 NewStore->setRmwBeacon(Instr->getRmwBeacon());
6471 }
6472 }
6473
6474 void TargetX8632::doAddressOptStoreSubVector() {
6475 auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
6476 Operand *Addr = Intrinsic->getArg(1);
6477 Operand *Data = Intrinsic->getArg(0);
6478 if (auto *OptAddr = computeAddressOpt(Intrinsic, Data->getType(), Addr)) {
6479 Intrinsic->setDeleted();
6480 const Ice::Intrinsics::IntrinsicInfo Info = {
6481 Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T,
6482 Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
6483 auto *NewStore = Context.insert<InstIntrinsic>(3, nullptr, Info);
6484 NewStore->addArg(Data);
6485 NewStore->addArg(OptAddr);
6486 NewStore->addArg(Intrinsic->getArg(2));
6487 }
6488 }
6489
6490 Operand *TargetX8632::lowerCmpRange(Operand *Comparison, uint64_t Min,
6491 uint64_t Max) {
6492 // TODO(ascull): 64-bit operands should not reach here, but only because the
6493 // 64-bit case is not implemented yet. This should be able to handle it.
6494 assert(Comparison->getType() != IceType_i64);
6495 // Subtracting 0 is a nop so don't do it
6496 if (Min != 0) {
6497 // Avoid clobbering the comparison by copying it
6498 Variable *T = nullptr;
6499 _mov(T, Comparison);
6500 _sub(T, Ctx->getConstantInt32(Min));
6501 Comparison = T;
6502 }
6503
6504 _cmp(Comparison, Ctx->getConstantInt32(Max - Min));
6505
6506 return Comparison;
6507 }
6508
6509 void TargetX8632::lowerCaseCluster(const CaseCluster &Case, Operand *Comparison,
6510 bool DoneCmp, CfgNode *DefaultTarget) {
6511 switch (Case.getKind()) {
6512 case CaseCluster::JumpTable: {
6513 InstX86Label *SkipJumpTable;
6514
6515 Operand *RangeIndex =
6516 lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
6517 if (DefaultTarget == nullptr) {
6518 // Skip over jump table logic if comparison not in range and no default
6519 SkipJumpTable = InstX86Label::create(Func, this);
6520 _br(CondX86::Br_a, SkipJumpTable);
6521 } else {
6522 _br(CondX86::Br_a, DefaultTarget);
6523 }
6524
6525 InstJumpTable *JumpTable = Case.getJumpTable();
6526 Context.insert(JumpTable);
6527
6528 // Make sure the index is a register of the same width as the base
6529 Variable *Index;
6530 const Type PointerType = IceType_i32;
6531 if (RangeIndex->getType() != PointerType) {
6532 Index = makeReg(PointerType);
6533 assert(RangeIndex->getType() != IceType_i64);
6534 Operand *RangeIndexRM = legalize(RangeIndex, Legal_Reg | Legal_Mem);
6535 _movzx(Index, RangeIndexRM);
6536 } else {
6537 Index = legalizeToReg(RangeIndex);
6538 }
6539
6540 constexpr RelocOffsetT RelocOffset = 0;
6541 constexpr Variable *NoBase = nullptr;
6542 constexpr Constant *NoOffset = nullptr;
6543 auto JTName = GlobalString::createWithString(Ctx, JumpTable->getName());
6544 Constant *Offset = Ctx->getConstantSym(RelocOffset, JTName);
6545 uint16_t Shift = typeWidthInBytesLog2(PointerType);
6546 constexpr auto Segment = X86OperandMem::SegmentRegisters::DefaultSegment;
6547
6548 Variable *Target = nullptr;
6549 if (PointerType == IceType_i32) {
6550 _mov(Target, X86OperandMem::create(Func, PointerType, NoBase, Offset,
6551 Index, Shift, Segment));
6552 } else {
6553 auto *Base = makeReg(IceType_i64);
6554 _lea(Base, X86OperandMem::create(Func, IceType_void, NoBase, Offset));
6555 _mov(Target, X86OperandMem::create(Func, PointerType, Base, NoOffset,
6556 Index, Shift, Segment));
6557 }
6558
6559 lowerIndirectJump(Target);
6560
6561 if (DefaultTarget == nullptr)
6562 Context.insert(SkipJumpTable);
6563 return;
6564 }
6565 case CaseCluster::Range: {
6566 if (Case.isUnitRange()) {
6567 // Single item
6568 if (!DoneCmp) {
6569 Constant *Value = Ctx->getConstantInt32(Case.getLow());
6570 _cmp(Comparison, Value);
6571 }
6572 _br(CondX86::Br_e, Case.getTarget());
6573 } else if (DoneCmp && Case.isPairRange()) {
6574 // Range of two items with the first item already compared against.
6575 _br(CondX86::Br_e, Case.getTarget());
6576 Constant *Value = Ctx->getConstantInt32(Case.getHigh());
6577 _cmp(Comparison, Value);
6578 _br(CondX86::Br_e, Case.getTarget());
6579 } else {
6580 // Range
6581 lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
6582 _br(CondX86::Br_be, Case.getTarget());
6583 }
6584 if (DefaultTarget != nullptr)
6585 _br(DefaultTarget);
6586 return;
6587 }
6588 }
6589 }
6590
6591 void TargetX8632::lowerSwitch(const InstSwitch *Instr) {
6592 // Group cases together and navigate through them with a binary search
6593 CaseClusterArray CaseClusters = CaseCluster::clusterizeSwitch(Func, Instr);
6594 Operand *Src0 = Instr->getComparison();
6595 CfgNode *DefaultTarget = Instr->getLabelDefault();
6596
6597 assert(CaseClusters.size() != 0); // Should always be at least one
6598
6599 if (Src0->getType() == IceType_i64) {
6600 Src0 = legalize(Src0); // get Base/Index into physical registers
6601 Operand *Src0Lo = loOperand(Src0);
6602 Operand *Src0Hi = hiOperand(Src0);
6603 if (CaseClusters.back().getHigh() > UINT32_MAX) {
6604 // TODO(ascull): handle 64-bit case properly (currently naive version)
6605 // This might be handled by a higher level lowering of switches.
6606 SizeT NumCases = Instr->getNumCases();
6607 if (NumCases >= 2) {
6608 Src0Lo = legalizeToReg(Src0Lo);
6609 Src0Hi = legalizeToReg(Src0Hi);
6610 } else {
6611 Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
6612 Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
6613 }
6614 for (SizeT I = 0; I < NumCases; ++I) {
6615 Constant *ValueLo = Ctx->getConstantInt32(Instr->getValue(I));
6616 Constant *ValueHi = Ctx->getConstantInt32(Instr->getValue(I) >> 32);
6617 InstX86Label *Label = InstX86Label::create(Func, this);
6618 _cmp(Src0Lo, ValueLo);
6619 _br(CondX86::Br_ne, Label);
6620 _cmp(Src0Hi, ValueHi);
6621 _br(CondX86::Br_e, Instr->getLabel(I));
6622 Context.insert(Label);
6623 }
6624 _br(Instr->getLabelDefault());
6625 return;
6626 } else {
6627 // All the values are 32-bit so just check the operand is too and then
6628 // fall through to the 32-bit implementation. This is a common case.
6629 Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
6630 Constant *Zero = Ctx->getConstantInt32(0);
6631 _cmp(Src0Hi, Zero);
6632 _br(CondX86::Br_ne, DefaultTarget);
6633 Src0 = Src0Lo;
6634 }
6635 }
6636
6637 // 32-bit lowering
6638
6639 if (CaseClusters.size() == 1) {
6640 // Jump straight to default if needed. Currently a common case as jump
6641 // tables occur on their own.
6642 constexpr bool DoneCmp = false;
6643 lowerCaseCluster(CaseClusters.front(), Src0, DoneCmp, DefaultTarget);
6644 return;
6645 }
6646
6647 // Going to be using multiple times so get it in a register early
6648 Variable *Comparison = legalizeToReg(Src0);
6649
6650 // A SearchSpan covers a contiguous range of case clusters still to be searched.
6651 struct SearchSpan {
6652 SearchSpan(SizeT Begin, SizeT Size, InstX86Label *Label)
6653 : Begin(Begin), Size(Size), Label(Label) {}
6654
6655 SizeT Begin;
6656 SizeT Size;
6657 InstX86Label *Label;
6658 };
6659 // The stack will only grow to the height of the tree so 12 should be plenty
6660 std::stack<SearchSpan, llvm::SmallVector<SearchSpan, 12>> SearchSpanStack;
6661 SearchSpanStack.emplace(0, CaseClusters.size(), nullptr);
6662 bool DoneCmp = false;
6663
6664 while (!SearchSpanStack.empty()) {
6665 SearchSpan Span = SearchSpanStack.top();
6666 SearchSpanStack.pop();
6667
6668 if (Span.Label != nullptr)
6669 Context.insert(Span.Label);
6670
6671 switch (Span.Size) {
6672 case 0:
6673 llvm::report_fatal_error("Invalid SearchSpan size");
6674 break;
6675
6676 case 1:
6677 lowerCaseCluster(CaseClusters[Span.Begin], Comparison, DoneCmp,
6678 SearchSpanStack.empty() ? nullptr : DefaultTarget);
6679 DoneCmp = false;
6680 break;
6681
6682 case 2: {
6683 const CaseCluster *CaseA = &CaseClusters[Span.Begin];
6684 const CaseCluster *CaseB = &CaseClusters[Span.Begin + 1];
6685
6686 // Placing a range last may allow register clobbering during the range
6687 // test. That means there is no need to clone the register. If it is a
6688 // unit range the comparison may have already been done in the binary
6689 // search (DoneCmp) and so it should be placed first. If this is a range
6690 // of two items and the comparison with the low value has already been
6691 // done, comparing with the other element is cheaper than a range test.
6692 // If the low end of the range is zero then there is no subtraction and
6693 // nothing to be gained.
6694 if (!CaseA->isUnitRange() &&
6695 !(CaseA->getLow() == 0 || (DoneCmp && CaseA->isPairRange()))) {
6696 std::swap(CaseA, CaseB);
6697 DoneCmp = false;
6698 }
6699
6700 lowerCaseCluster(*CaseA, Comparison, DoneCmp);
6701 DoneCmp = false;
6702 lowerCaseCluster(*CaseB, Comparison, DoneCmp,
6703 SearchSpanStack.empty() ? nullptr : DefaultTarget);
6704 } break;
6705
6706 default:
6707 // Pick the middle item and branch b or ae
6708 SizeT PivotIndex = Span.Begin + (Span.Size / 2);
6709 const CaseCluster &Pivot = CaseClusters[PivotIndex];
6710 Constant *Value = Ctx->getConstantInt32(Pivot.getLow());
6711 InstX86Label *Label = InstX86Label::create(Func, this);
6712 _cmp(Comparison, Value);
6713 // TODO(ascull): does it always have to be far?
6714 _br(CondX86::Br_b, Label, InstX86Br::Far);
6715 // Lower the left and (pivot+right) sides, falling through to the right
6716 SearchSpanStack.emplace(Span.Begin, Span.Size / 2, Label);
6717 SearchSpanStack.emplace(PivotIndex, Span.Size - (Span.Size / 2), nullptr);
6718 DoneCmp = true;
6719 break;
6720 }
6721 }
6722
6723 _br(DefaultTarget);
6724 }
6725
6726 /// The following pattern occurs often in lowered C and C++ code:
6727 ///
6728 /// %cmp = fcmp/icmp pred <n x ty> %src0, %src1
6729 /// %cmp.ext = sext <n x i1> %cmp to <n x ty>
6730 ///
6731 /// We can eliminate the sext operation by copying the result of pcmpeqd,
6732 /// pcmpgtd, or cmpps (which produce sign extended results) to the result of
6733 /// the sext operation.
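///
/// For example (illustrative IR; the lowering below does the bookkeeping):
///   %cmp = icmp sgt <4 x i32> %a, %b        ; lowered to pcmpgtd
///   %ext = sext <4 x i1> %cmp to <4 x i32>  ; becomes a plain movp copy of
///                                           ; the pcmpgtd result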
6734
6735 void TargetX8632::eliminateNextVectorSextInstruction(
6736 Variable *SignExtendedResult) {
6737 if (auto *NextCast =
6738 llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
6739 if (NextCast->getCastKind() == InstCast::Sext &&
6740 NextCast->getSrc(0) == SignExtendedResult) {
6741 NextCast->setDeleted();
6742 _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult));
6743 // Skip over the instruction.
6744 Context.advanceNext();
6745 }
6746 }
6747 }
6748
6749 void TargetX8632::lowerUnreachable(const InstUnreachable * /*Instr*/) {
6750 _ud2();
6751 // Add a fake use of esp to make sure esp adjustments after the unreachable
6752 // do not get dead-code eliminated.
6753 keepEspLiveAtExit();
6754 }
6755
6756 void TargetX8632::lowerBreakpoint(const InstBreakpoint * /*Instr*/) { _int3(); }
6757
6758 void TargetX8632::lowerRMW(const InstX86FakeRMW *RMW) {
6759 // If the beacon variable's live range does not end in this instruction,
6760 // then it must end in the modified Store instruction that follows. This
6761 // means that the original Store instruction is still there, either because
6762 // the value being stored is used beyond the Store instruction, or because
6763 // dead code elimination did not happen. In either case, we cancel RMW
6764 // lowering (and the caller deletes the RMW instruction).
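// For reference, the pattern being folded is roughly (illustrative):
//   a = load [addr]; a2 = a + b; store a2, [addr]  ==>  add [addr], b
// and it is only applied when the intermediate values are otherwise unused.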
6765 if (!RMW->isLastUse(RMW->getBeacon()))
6766 return;
6767 Operand *Src = RMW->getData();
6768 Type Ty = Src->getType();
6769 X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
6770 doMockBoundsCheck(Addr);
6771 if (Ty == IceType_i64) {
6772 Src = legalizeUndef(Src);
6773 Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm);
6774 Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm);
6775 auto *AddrLo = llvm::cast<X86OperandMem>(loOperand(Addr));
6776 auto *AddrHi = llvm::cast<X86OperandMem>(hiOperand(Addr));
6777 switch (RMW->getOp()) {
6778 default:
6779 // TODO(stichnot): Implement other arithmetic operators.
6780 break;
6781 case InstArithmetic::Add:
6782 _add_rmw(AddrLo, SrcLo);
6783 _adc_rmw(AddrHi, SrcHi);
6784 return;
6785 case InstArithmetic::Sub:
6786 _sub_rmw(AddrLo, SrcLo);
6787 _sbb_rmw(AddrHi, SrcHi);
6788 return;
6789 case InstArithmetic::And:
6790 _and_rmw(AddrLo, SrcLo);
6791 _and_rmw(AddrHi, SrcHi);
6792 return;
6793 case InstArithmetic::Or:
6794 _or_rmw(AddrLo, SrcLo);
6795 _or_rmw(AddrHi, SrcHi);
6796 return;
6797 case InstArithmetic::Xor:
6798 _xor_rmw(AddrLo, SrcLo);
6799 _xor_rmw(AddrHi, SrcHi);
6800 return;
6801 }
6802 } else {
6803 // x86-32: i8, i16, i32
6804 // x86-64: i8, i16, i32, i64
6805 switch (RMW->getOp()) {
6806 default:
6807 // TODO(stichnot): Implement other arithmetic operators.
6808 break;
6809 case InstArithmetic::Add:
6810 Src = legalize(Src, Legal_Reg | Legal_Imm);
6811 _add_rmw(Addr, Src);
6812 return;
6813 case InstArithmetic::Sub:
6814 Src = legalize(Src, Legal_Reg | Legal_Imm);
6815 _sub_rmw(Addr, Src);
6816 return;
6817 case InstArithmetic::And:
6818 Src = legalize(Src, Legal_Reg | Legal_Imm);
6819 _and_rmw(Addr, Src);
6820 return;
6821 case InstArithmetic::Or:
6822 Src = legalize(Src, Legal_Reg | Legal_Imm);
6823 _or_rmw(Addr, Src);
6824 return;
6825 case InstArithmetic::Xor:
6826 Src = legalize(Src, Legal_Reg | Legal_Imm);
6827 _xor_rmw(Addr, Src);
6828 return;
6829 }
6830 }
6831 llvm::report_fatal_error("Couldn't lower RMW instruction");
6832 }
6833
6834 void TargetX8632::lowerOther(const Inst *Instr) {
6835 if (const auto *RMW = llvm::dyn_cast<InstX86FakeRMW>(Instr)) {
6836 lowerRMW(RMW);
6837 } else {
6838 TargetLowering::lowerOther(Instr);
6839 }
6840 }
6841
6842 /// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to
6843 /// preserve integrity of liveness analysis. Undef values are also turned into
6844 /// zeroes, since loOperand() and hiOperand() don't expect Undef input.
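/// For example (illustrative; the .lo/.hi names are placeholders):
///   %x = phi i64 [ %a, %pred1 ], [ undef, %pred2 ]
/// becomes
///   %x.lo = phi i32 [ %a.lo, %pred1 ], [ 0, %pred2 ]
///   %x.hi = phi i32 [ %a.hi, %pred1 ], [ 0, %pred2 ]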
6845 void TargetX8632::prelowerPhis() {
6846 PhiLowering::prelowerPhis32Bit<TargetX8632>(this, Context.getNode(), Func);
6847 }
6848
6849 void TargetX8632::genTargetHelperCallFor(Inst *Instr) {
6850 uint32_t StackArgumentsSize = 0;
6851 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
6852 RuntimeHelper HelperID = RuntimeHelper::H_Num;
6853 Variable *Dest = Arith->getDest();
6854 Type DestTy = Dest->getType();
6855 if (DestTy == IceType_i64) {
6856 switch (Arith->getOp()) {
6857 default:
6858 return;
6859 case InstArithmetic::Udiv:
6860 HelperID = RuntimeHelper::H_udiv_i64;
6861 break;
6862 case InstArithmetic::Sdiv:
6863 HelperID = RuntimeHelper::H_sdiv_i64;
6864 break;
6865 case InstArithmetic::Urem:
6866 HelperID = RuntimeHelper::H_urem_i64;
6867 break;
6868 case InstArithmetic::Srem:
6869 HelperID = RuntimeHelper::H_srem_i64;
6870 break;
6871 }
6872 } else if (isVectorType(DestTy)) {
6873 Variable *Dest = Arith->getDest();
6874 Operand *Src0 = Arith->getSrc(0);
6875 Operand *Src1 = Arith->getSrc(1);
6876 switch (Arith->getOp()) {
6877 default:
6878 return;
6879 case InstArithmetic::Mul:
6880 if (DestTy == IceType_v16i8) {
6881 scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
6882 Arith->setDeleted();
6883 }
6884 return;
6885 case InstArithmetic::Shl:
6886 case InstArithmetic::Lshr:
6887 case InstArithmetic::Ashr:
6888 if (llvm::isa<Constant>(Src1)) {
6889 return;
6890 }
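// A non-constant shift amount intentionally falls through and is scalarized
// along with the cases below.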
6891 case InstArithmetic::Udiv:
6892 case InstArithmetic::Urem:
6893 case InstArithmetic::Sdiv:
6894 case InstArithmetic::Srem:
6895 case InstArithmetic::Frem:
6896 scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
6897 Arith->setDeleted();
6898 return;
6899 }
6900 } else {
6901 switch (Arith->getOp()) {
6902 default:
6903 return;
6904 case InstArithmetic::Frem:
6905 if (isFloat32Asserting32Or64(DestTy))
6906 HelperID = RuntimeHelper::H_frem_f32;
6907 else
6908 HelperID = RuntimeHelper::H_frem_f64;
6909 }
6910 }
6911 constexpr SizeT MaxSrcs = 2;
6912 InstCall *Call = makeHelperCall(HelperID, Dest, MaxSrcs);
6913 Call->addArg(Arith->getSrc(0));
6914 Call->addArg(Arith->getSrc(1));
6915 StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
6916 Context.insert(Call);
6917 Arith->setDeleted();
6918 } else if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
6919 InstCast::OpKind CastKind = Cast->getCastKind();
6920 Operand *Src0 = Cast->getSrc(0);
6921 const Type SrcType = Src0->getType();
6922 Variable *Dest = Cast->getDest();
6923 const Type DestTy = Dest->getType();
6924 RuntimeHelper HelperID = RuntimeHelper::H_Num;
6925 Variable *CallDest = Dest;
6926 switch (CastKind) {
6927 default:
6928 return;
6929 case InstCast::Fptosi:
6930 if (DestTy == IceType_i64) {
6931 HelperID = isFloat32Asserting32Or64(SrcType)
6932 ? RuntimeHelper::H_fptosi_f32_i64
6933 : RuntimeHelper::H_fptosi_f64_i64;
6934 } else {
6935 return;
6936 }
6937 break;
6938 case InstCast::Fptoui:
6939 if (isVectorType(DestTy)) {
6940 assert(DestTy == IceType_v4i32);
6941 assert(SrcType == IceType_v4f32);
6942 HelperID = RuntimeHelper::H_fptoui_4xi32_f32;
6943 } else if (DestTy == IceType_i64 || DestTy == IceType_i32) {
6944 if (isInt32Asserting32Or64(DestTy)) {
6945 HelperID = isFloat32Asserting32Or64(SrcType)
6946 ? RuntimeHelper::H_fptoui_f32_i32
6947 : RuntimeHelper::H_fptoui_f64_i32;
6948 } else {
6949 HelperID = isFloat32Asserting32Or64(SrcType)
6950 ? RuntimeHelper::H_fptoui_f32_i64
6951 : RuntimeHelper::H_fptoui_f64_i64;
6952 }
6953 } else {
6954 return;
6955 }
6956 break;
6957 case InstCast::Sitofp:
6958 if (SrcType == IceType_i64) {
6959 HelperID = isFloat32Asserting32Or64(DestTy)
6960 ? RuntimeHelper::H_sitofp_i64_f32
6961 : RuntimeHelper::H_sitofp_i64_f64;
6962 } else {
6963 return;
6964 }
6965 break;
6966 case InstCast::Uitofp:
6967 if (isVectorType(SrcType)) {
6968 assert(DestTy == IceType_v4f32);
6969 assert(SrcType == IceType_v4i32);
6970 HelperID = RuntimeHelper::H_uitofp_4xi32_4xf32;
6971 } else if (SrcType == IceType_i64 || SrcType == IceType_i32) {
6972 if (isInt32Asserting32Or64(SrcType)) {
6973 HelperID = isFloat32Asserting32Or64(DestTy)
6974 ? RuntimeHelper::H_uitofp_i32_f32
6975 : RuntimeHelper::H_uitofp_i32_f64;
6976 } else {
6977 HelperID = isFloat32Asserting32Or64(DestTy)
6978 ? RuntimeHelper::H_uitofp_i64_f32
6979 : RuntimeHelper::H_uitofp_i64_f64;
6980 }
6981 } else {
6982 return;
6983 }
6984 break;
6985 case InstCast::Bitcast: {
6986 if (DestTy == Src0->getType())
6987 return;
6988 switch (DestTy) {
6989 default:
6990 return;
6991 case IceType_i8:
6992 assert(Src0->getType() == IceType_v8i1);
6993 HelperID = RuntimeHelper::H_bitcast_8xi1_i8;
6994 CallDest = Func->makeVariable(IceType_i32);
6995 break;
6996 case IceType_i16:
6997 assert(Src0->getType() == IceType_v16i1);
6998 HelperID = RuntimeHelper::H_bitcast_16xi1_i16;
6999 CallDest = Func->makeVariable(IceType_i32);
7000 break;
7001 case IceType_v8i1: {
7002 assert(Src0->getType() == IceType_i8);
7003 HelperID = RuntimeHelper::H_bitcast_i8_8xi1;
7004 Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
7005 // Arguments to functions are required to be at least 32 bits wide.
7006 Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
7007 Src0 = Src0AsI32;
7008 } break;
7009 case IceType_v16i1: {
7010 assert(Src0->getType() == IceType_i16);
7011 HelperID = RuntimeHelper::H_bitcast_i16_16xi1;
7012 Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
7013 // Arguments to functions are required to be at least 32 bits wide.
7014 Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
7015 Src0 = Src0AsI32;
7016 } break;
7017 }
7018 } break;
7019 }
7020 constexpr SizeT MaxSrcs = 1;
7021 InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs);
7022 Call->addArg(Src0);
7023 StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
7024 Context.insert(Call);
7025 // The PNaCl ABI disallows i8/i16 return types, so truncate the helper
7026 // call result to the appropriate type as necessary.
7027 if (CallDest->getType() != Dest->getType())
7028 Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest);
7029 Cast->setDeleted();
7030 } else if (auto *Intrinsic = llvm::dyn_cast<InstIntrinsic>(Instr)) {
7031 CfgVector<Type> ArgTypes;
7032 Type ReturnType = IceType_void;
7033 switch (Intrinsic->getIntrinsicID()) {
7034 default:
7035 return;
7036 case Intrinsics::Ctpop: {
7037 Operand *Val = Intrinsic->getArg(0);
7038 Type ValTy = Val->getType();
7039 if (ValTy == IceType_i64)
7040 ArgTypes = {IceType_i64};
7041 else
7042 ArgTypes = {IceType_i32};
7043 ReturnType = IceType_i32;
7044 } break;
7045 case Intrinsics::Longjmp:
7046 ArgTypes = {IceType_i32, IceType_i32};
7047 ReturnType = IceType_void;
7048 break;
7049 case Intrinsics::Memcpy:
7050 ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
7051 ReturnType = IceType_void;
7052 break;
7053 case Intrinsics::Memmove:
7054 ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
7055 ReturnType = IceType_void;
7056 break;
7057 case Intrinsics::Memset:
7058 ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
7059 ReturnType = IceType_void;
7060 break;
7061 case Intrinsics::Setjmp:
7062 ArgTypes = {IceType_i32};
7063 ReturnType = IceType_i32;
7064 break;
7065 }
7066 StackArgumentsSize = getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
7067 } else if (auto *Call = llvm::dyn_cast<InstCall>(Instr)) {
7068 StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
7069 } else if (auto *Ret = llvm::dyn_cast<InstRet>(Instr)) {
7070 if (!Ret->hasRetValue())
7071 return;
7072 Operand *RetValue = Ret->getRetValue();
7073 Type ReturnType = RetValue->getType();
7074 if (!isScalarFloatingType(ReturnType))
7075 return;
7076 StackArgumentsSize = typeWidthInBytes(ReturnType);
7077 } else {
7078 return;
7079 }
7080 StackArgumentsSize = applyStackAlignment(StackArgumentsSize);
7081 updateMaxOutArgsSizeBytes(StackArgumentsSize);
7082 }
7083
7084 uint32_t
7085 TargetX8632::getCallStackArgumentsSizeBytes(const CfgVector<Type> &ArgTypes,
7086 Type ReturnType) {
7087 uint32_t OutArgumentsSizeBytes = 0;
7088 uint32_t XmmArgCount = 0;
7089 uint32_t GprArgCount = 0;
7090 for (SizeT i = 0, NumArgTypes = ArgTypes.size(); i < NumArgTypes; ++i) {
7091 Type Ty = ArgTypes[i];
7092 // The PNaCl ABI requires the width of arguments to be at least 32 bits.
7093 assert(typeWidthInBytes(Ty) >= 4);
7094 if (isVectorType(Ty) &&
7095 RegX8632::getRegisterForXmmArgNum(RegX8632::getArgIndex(i, XmmArgCount))
7096 .hasValue()) {
7097 ++XmmArgCount;
7098 } else if (isScalarIntegerType(Ty) &&
7099 RegX8632::getRegisterForGprArgNum(
7100 Ty, RegX8632::getArgIndex(i, GprArgCount))
7101 .hasValue()) {
7102 // The 64 bit ABI allows some integers to be passed in GPRs.
7103 ++GprArgCount;
7104 } else {
7105 if (isVectorType(Ty)) {
7106 OutArgumentsSizeBytes = applyStackAlignment(OutArgumentsSizeBytes);
7107 }
7108 OutArgumentsSizeBytes += typeWidthInBytesOnStack(Ty);
7109 }
7110 }
7111 // The 32 bit ABI requires floating point values to be returned on the x87
7112 // FP stack. Ensure there is enough space for the fstp/movs for floating
7113 // returns.
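// For example (illustrative), a call returning f64 reserves at least 8 bytes
// of out-args space so the x87 result can be spilled with something like
// "fstpl (%esp)" and then reloaded from there.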
7114 if (isScalarFloatingType(ReturnType)) {
7115 OutArgumentsSizeBytes =
7116 std::max(OutArgumentsSizeBytes,
7117 static_cast<uint32_t>(typeWidthInBytesOnStack(ReturnType)));
7118 }
7119 return OutArgumentsSizeBytes;
7120 }
7121
7122 uint32_t TargetX8632::getCallStackArgumentsSizeBytes(const InstCall *Instr) {
7123 // Build a vector of the arguments' types.
7124 const SizeT NumArgs = Instr->getNumArgs();
7125 CfgVector<Type> ArgTypes;
7126 ArgTypes.reserve(NumArgs);
7127 for (SizeT i = 0; i < NumArgs; ++i) {
7128 Operand *Arg = Instr->getArg(i);
7129 ArgTypes.emplace_back(Arg->getType());
7130 }
7131 // Compute the return type (if any).
7132 Type ReturnType = IceType_void;
7133 Variable *Dest = Instr->getDest();
7134 if (Dest != nullptr)
7135 ReturnType = Dest->getType();
7136 return getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
7137 }
7138
7139 Variable *TargetX8632::makeZeroedRegister(Type Ty, RegNumT RegNum) {
7140 Variable *Reg = makeReg(Ty, RegNum);
7141 switch (Ty) {
7142 case IceType_i1:
7143 case IceType_i8:
7144 case IceType_i16:
7145 case IceType_i32:
7146 case IceType_i64:
7147 // Conservatively do "mov reg, 0" to avoid modifying FLAGS.
7148 _mov(Reg, Ctx->getConstantZero(Ty));
7149 break;
7150 case IceType_f32:
7151 case IceType_f64:
7152 Context.insert<InstFakeDef>(Reg);
7153 _xorps(Reg, Reg);
7154 break;
7155 default:
7156 // All vector types use the same pxor instruction.
7157 assert(isVectorType(Ty));
7158 Context.insert<InstFakeDef>(Reg);
7159 _pxor(Reg, Reg);
7160 break;
7161 }
7162 return Reg;
7163 }
7164
7165 // There is no support for loading or emitting vector constants, so the vector
7166 // values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are
7167 // initialized with register operations.
7168 //
7169 // TODO(wala): Add limited support for vector constants so that complex
7170 // initialization in registers is unnecessary.
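//
// For example, makeVectorOfOnes() below materializes <1, 1, 1, 1> roughly as
// (illustrative mnemonics for v4i32; register choice is up to the allocator):
//   pxor    xmm0, xmm0    ; all zeros
//   pcmpeqd xmm1, xmm1    ; all minus ones
//   psubd   xmm0, xmm1    ; 0 - (-1) == 1 in each lane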
7171
7172 Variable *TargetX8632::makeVectorOfZeros(Type Ty, RegNumT RegNum) {
7173 return makeZeroedRegister(Ty, RegNum);
7174 }
7175
7176 Variable *TargetX8632::makeVectorOfMinusOnes(Type Ty, RegNumT RegNum) {
7177 Variable *MinusOnes = makeReg(Ty, RegNum);
7178 // Insert a FakeDef so the live range of MinusOnes is not overestimated.
7179 Context.insert<InstFakeDef>(MinusOnes);
7180 if (Ty == IceType_f64)
7181 // Making a vector of minus ones of type f64 is currently only used for
7182 // the fabs intrinsic. To use the f64 type to create this mask with
7183 // pcmpeqq requires SSE 4.1. Since we're just creating a mask, pcmpeqd
7184 // does the same job and only requires SSE2.
7185 _pcmpeq(MinusOnes, MinusOnes, IceType_f32);
7186 else
7187 _pcmpeq(MinusOnes, MinusOnes);
7188 return MinusOnes;
7189 }
7190
7191 Variable *TargetX8632::makeVectorOfOnes(Type Ty, RegNumT RegNum) {
7192 Variable *Dest = makeVectorOfZeros(Ty, RegNum);
7193 Variable *MinusOne = makeVectorOfMinusOnes(Ty);
7194 _psub(Dest, MinusOne);
7195 return Dest;
7196 }
7197
7198 Variable *TargetX8632::makeVectorOfHighOrderBits(Type Ty, RegNumT RegNum) {
7199 assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
7200 Ty == IceType_v16i8);
7201 if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
7202 Variable *Reg = makeVectorOfOnes(Ty, RegNum);
7203 SizeT Shift = typeWidthInBytes(typeElementType(Ty)) * X86_CHAR_BIT - 1;
7204 _psll(Reg, Ctx->getConstantInt8(Shift));
7205 return Reg;
7206 } else {
7207 // SSE has no left shift operation for vectors of 8 bit integers.
7208 constexpr uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
7209 Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
7210 Variable *Reg = makeReg(Ty, RegNum);
7211 _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
7212 _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
7213 return Reg;
7214 }
7215 }
7216
7217 /// Construct a mask in a register that can be and'ed with a floating-point
7218 /// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32
7219 /// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as vector of
7220 /// ones logically right shifted one bit.
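/// For example (illustrative), for f32/v4f32 the mask is built roughly as:
///   pcmpeqd xmm, xmm   ; <4 x 0xffffffff>
///   psrld   xmm, 1     ; <4 x 0x7fffffff>
/// and the value is then and'ed with it to clear the sign bit.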
7221 // TODO(stichnot): Fix the wala TODO above, to represent vector constants
7222 // in memory.
7223
7224 Variable *TargetX8632::makeVectorOfFabsMask(Type Ty, RegNumT RegNum) {
7225 Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
7226 _psrl(Reg, Ctx->getConstantInt8(1));
7227 return Reg;
7228 }
7229
7230 X86OperandMem *TargetX8632::getMemoryOperandForStackSlot(Type Ty,
7231 Variable *Slot,
7232 uint32_t Offset) {
7233 // Ensure that Loc is a stack slot.
7234 assert(Slot->mustNotHaveReg());
7235 assert(Slot->getRegNum().hasNoValue());
7236 // Compute the location of Loc in memory.
7237 // TODO(wala,stichnot): lea should not be required. The address of the
7238 // stack slot is known at compile time (although not until after
7239 // addProlog()).
7240 const Type PointerType = IceType_i32;
7241 Variable *Loc = makeReg(PointerType);
7242 _lea(Loc, Slot);
7243 Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
7244 return X86OperandMem::create(Func, Ty, Loc, ConstantOffset);
7245 }
7246
7247 /// Lowering helper to copy a scalar integer source operand into some 8-bit
7248 /// GPR. Src is assumed to already be legalized. If the source operand is
7249 /// known to be a memory or immediate operand, a simple mov will suffice. But
7250 /// if the source operand can be a physical register, then it must first be
7251 /// copied into a physical register that is truncable to 8-bit, then truncated
7252 /// into a physical register that can receive a truncation, and finally copied
7253 /// into the result 8-bit register (which in general can be any 8-bit
7254 /// register). For example, moving %ebp into %ah may be accomplished as:
7255 /// movl %ebp, %edx
7256 /// mov_trunc %edx, %dl // this redundant assignment is ultimately elided
7257 /// movb %dl, %ah
7258 /// On the other hand, moving a memory or immediate operand into ah:
7259 /// movb 4(%ebp), %ah
7260 /// movb $my_imm, %ah
7261 ///
7262 /// Note #1. On a 64-bit target, the "movb 4(%ebp), %ah" is likely not
7263 /// encodable, so RegNum=Reg_ah should NOT be given as an argument. Instead,
7264 /// use RegNum=RegNumT() and then let the caller do a separate copy into
7265 /// Reg_ah.
7266 ///
7267 /// Note #2. ConstantRelocatable operands are also put through this process
7268 /// (not truncated directly) because our ELF emitter does R_386_32 relocations
7269 /// but not R_386_8 relocations.
7270 ///
7271 /// Note #3. If Src is a Variable, the result will be an infinite-weight i8
7272 /// Variable with the RCX86_IsTrunc8Rcvr register class. As such, this helper
7273 /// is a convenient way to prevent ah/bh/ch/dh from being an (invalid)
7274 /// argument to the pinsrb instruction.
7275
7276 Variable *TargetX8632::copyToReg8(Operand *Src, RegNumT RegNum) {
7277 Type Ty = Src->getType();
7278 assert(isScalarIntegerType(Ty));
7279 assert(Ty != IceType_i1);
7280 Variable *Reg = makeReg(IceType_i8, RegNum);
7281 Reg->setRegClass(RCX86_IsTrunc8Rcvr);
7282 if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) {
7283 Variable *SrcTruncable = makeReg(Ty);
7284 switch (Ty) {
7285 case IceType_i64:
7286 SrcTruncable->setRegClass(RCX86_Is64To8);
7287 break;
7288 case IceType_i32:
7289 SrcTruncable->setRegClass(RCX86_Is32To8);
7290 break;
7291 case IceType_i16:
7292 SrcTruncable->setRegClass(RCX86_Is16To8);
7293 break;
7294 default:
7295 // i8 - just use default register class
7296 break;
7297 }
7298 Variable *SrcRcvr = makeReg(IceType_i8);
7299 SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr);
7300 _mov(SrcTruncable, Src);
7301 _mov(SrcRcvr, SrcTruncable);
7302 Src = SrcRcvr;
7303 }
7304 _mov(Reg, Src);
7305 return Reg;
7306 }
7307
7308 /// Helper for legalize() to emit the right code to lower an operand to a
7309 /// register of the appropriate type.
7310
7311 Variable *TargetX8632::copyToReg(Operand *Src, RegNumT RegNum) {
7312 Type Ty = Src->getType();
7313 Variable *Reg = makeReg(Ty, RegNum);
7314 if (isVectorType(Ty)) {
7315 _movp(Reg, Src);
7316 } else {
7317 _mov(Reg, Src);
7318 }
7319 return Reg;
7320 }
7321
7322 Operand *TargetX8632::legalize(Operand *From, LegalMask Allowed,
7323 RegNumT RegNum) {
7324 const Type Ty = From->getType();
7325 // Assert that a physical register is allowed. To date, all calls to
7326 // legalize() allow a physical register. If a physical register needs to be
7327 // explicitly disallowed, then new code will need to be written to force a
7328 // spill.
7329 assert(Allowed & Legal_Reg);
7330 // If we're asking for a specific physical register, make sure we're not
7331 // allowing any other operand kinds. (This could be future work, e.g. allow
7332 // the shl shift amount to be either an immediate or in ecx.)
7333 assert(RegNum.hasNoValue() || Allowed == Legal_Reg);
7334
7335 // Substitute with an available infinite-weight variable if possible. Only
7336 // do this when we are not asking for a specific register, and when the
7337 // substitution is not locked to a specific register, and when the types
7338 // match, in order to capture the vast majority of opportunities and avoid
7339 // corner cases in the lowering.
7340 if (RegNum.hasNoValue()) {
7341 if (Variable *Subst = getContext().availabilityGet(From)) {
7342 // At this point we know there is a potential substitution available.
7343 if (Subst->mustHaveReg() && !Subst->hasReg()) {
7344 // At this point we know the substitution will have a register.
7345 if (From->getType() == Subst->getType()) {
7346 // At this point we know the substitution's register is compatible.
7347 return Subst;
7348 }
7349 }
7350 }
7351 }
7352
7353 if (auto *Mem = llvm::dyn_cast<X86OperandMem>(From)) {
7354 // Before doing anything with a Mem operand, we need to ensure that the
7355 // Base and Index components are in physical registers.
7356 Variable *Base = Mem->getBase();
7357 Variable *Index = Mem->getIndex();
7358 Constant *Offset = Mem->getOffset();
7359 Variable *RegBase = nullptr;
7360 Variable *RegIndex = nullptr;
7361 uint16_t Shift = Mem->getShift();
7362 if (Base) {
7363 RegBase = llvm::cast<Variable>(
7364 legalize(Base, Legal_Reg | Legal_Rematerializable));
7365 }
7366 if (Index) {
7367 // TODO(jpp): perhaps we should only allow Legal_Reg if
7368 // Base->isRematerializable.
7369 RegIndex = llvm::cast<Variable>(
7370 legalize(Index, Legal_Reg | Legal_Rematerializable));
7371 }
7372
7373 if (Base != RegBase || Index != RegIndex) {
7374 Mem = X86OperandMem::create(Func, Ty, RegBase, Offset, RegIndex, Shift,
7375 Mem->getSegmentRegister());
7376 }
7377
7378 From = Mem;
7379
7380 if (!(Allowed & Legal_Mem)) {
7381 From = copyToReg(From, RegNum);
7382 }
7383 return From;
7384 }
7385
7386 if (auto *Const = llvm::dyn_cast<Constant>(From)) {
7387 if (llvm::isa<ConstantUndef>(Const)) {
7388 From = legalizeUndef(Const, RegNum);
7389 if (isVectorType(Ty))
7390 return From;
7391 Const = llvm::cast<Constant>(From);
7392 }
7393 // There should be no constants of vector type (other than undef).
7394 assert(!isVectorType(Ty));
7395
7396 if (!llvm::dyn_cast<ConstantRelocatable>(Const)) {
7397 if (isScalarFloatingType(Ty)) {
7398 // Convert a scalar floating point constant into an explicit memory
7399 // operand.
7400 if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(Const)) {
7401 if (Utils::isPositiveZero(ConstFloat->getValue()))
7402 return makeZeroedRegister(Ty, RegNum);
7403 } else if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(Const)) {
7404 if (Utils::isPositiveZero(ConstDouble->getValue()))
7405 return makeZeroedRegister(Ty, RegNum);
7406 }
7407
7408 auto *CFrom = llvm::cast<Constant>(From);
7409 assert(CFrom->getShouldBePooled());
7410 Constant *Offset = Ctx->getConstantSym(0, CFrom->getLabelName());
7411 auto *Mem = X86OperandMem::create(Func, Ty, nullptr, Offset);
7412 From = Mem;
7413 }
7414 }
7415
7416 bool NeedsReg = false;
7417 if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
7418 // Immediate specifically not allowed.
7419 NeedsReg = true;
7420 if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
7421 // On x86, FP constants are lowered to mem operands.
7422 NeedsReg = true;
7423 if (NeedsReg) {
7424 From = copyToReg(From, RegNum);
7425 }
7426 return From;
7427 }
7428
7429 if (auto *Var = llvm::dyn_cast<Variable>(From)) {
7430 // Check if the variable is guaranteed a physical register. This can
7431 // happen either when the variable is pre-colored or when it is assigned
7432 // infinite weight.
7433 bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
7434 bool MustRematerialize =
7435 (Var->isRematerializable() && !(Allowed & Legal_Rematerializable));
7436 // We need a new physical register for the operand if:
7437 // - Mem is not allowed and Var isn't guaranteed a physical register, or
7438 // - RegNum is required and Var->getRegNum() doesn't match, or
7439 // - Var is a rematerializable variable and rematerializable
7440 //   pass-through is not allowed (in which case we need a lea
7441 //   instruction).
7442 if (MustRematerialize) {
7443 Variable *NewVar = makeReg(Ty, RegNum);
7444 // Since Var is rematerializable, the offset will be added when the lea
7445 // is emitted.
7446 constexpr Constant *NoOffset = nullptr;
7447 auto *Mem = X86OperandMem::create(Func, Ty, Var, NoOffset);
7448 _lea(NewVar, Mem);
7449 From = NewVar;
7450 } else if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
7451 (RegNum.hasValue() && RegNum != Var->getRegNum())) {
7452 From = copyToReg(From, RegNum);
7453 }
7454 return From;
7455 }
7456
7457 llvm::report_fatal_error("Unhandled operand kind in legalize()");
7458 return From;
7459 }
7460
7461 /// Provide a trivial wrapper to legalize() for this common usage.
7462
7463 Variable *TargetX8632::legalizeToReg(Operand *From, RegNumT RegNum) {
7464 return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
7465 }
7466
7467 /// Legalize undef values to concrete values.
7468
7469 Operand *TargetX8632::legalizeUndef(Operand *From, RegNumT RegNum) {
7470 Type Ty = From->getType();
7471 if (llvm::isa<ConstantUndef>(From)) {
7472 // Lower undefs to zero. Another option is to lower undefs to an
7473 // uninitialized register; however, using an uninitialized register
7474 // results in less predictable code.
7475 //
7476 // If in the future the implementation is changed to lower undef values to
7477 // uninitialized registers, a FakeDef will be needed:
7478 // Context.insert<InstFakeDef>(Reg);
7479 // This is in order to ensure that the live range of Reg is not
7480 // overestimated. If the constant being lowered is a 64 bit value, then
7481 // the result should be split and the lo and hi components will need to go
7482 // in uninitialized registers.
7483 if (isVectorType(Ty))
7484 return makeVectorOfZeros(Ty, RegNum);
7485 return Ctx->getConstantZero(Ty);
7486 }
7487 return From;
7488 }
7489
7490 /// For the cmp instruction, if Src1 is an immediate, or known to be a
7491 /// physical register, we can allow Src0 to be a memory operand. Otherwise,
7492 /// Src0 must be copied into a physical register. (Actually, either Src0 or
7493 /// Src1 can be chosen for the physical register, but unfortunately we have to
7494 /// commit to one or the other before register allocation.)
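/// For example (illustrative), "cmpl $42, 8(%ebp)" is encodable, but there is
/// no cmp with two memory operands, so if Src1 is neither an immediate nor
/// known to be in a register, Src0 is forced into a register.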
7495
7496 Operand *TargetX8632::legalizeSrc0ForCmp(Operand *Src0, Operand *Src1) {
7497 bool IsSrc1ImmOrReg = false;
7498 if (llvm::isa<Constant>(Src1)) {
7499 IsSrc1ImmOrReg = true;
7500 } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) {
7501 if (Var->hasReg())
7502 IsSrc1ImmOrReg = true;
7503 }
7504 return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
7505 }
7506
7507 X86OperandMem *TargetX8632::formMemoryOperand(Operand *Opnd, Type Ty,
7508 bool DoLegalize) {
7509 auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd);
7510 // It may be the case that address mode optimization already creates an
7511 // X86OperandMem, so in that case it wouldn't need another level of
7512 // transformation.
7513 if (!Mem) {
7514 auto *Base = llvm::dyn_cast<Variable>(Opnd);
7515 auto *Offset = llvm::dyn_cast<Constant>(Opnd);
7516 assert(Base || Offset);
7517 if (Offset) {
7518 if (!llvm::isa<ConstantRelocatable>(Offset)) {
7519 if (llvm::isa<ConstantInteger64>(Offset)) {
7520 // Memory operands cannot have 64-bit immediates, so they must be
7521 // legalized into a register only.
7522 Base = llvm::cast<Variable>(legalize(Offset, Legal_Reg));
7523 Offset = nullptr;
7524 } else {
7525 Offset = llvm::cast<Constant>(legalize(Offset));
7526
7527 assert(llvm::isa<ConstantInteger32>(Offset) ||
7528 llvm::isa<ConstantRelocatable>(Offset));
7529 }
7530 }
7531 }
7532 Mem = X86OperandMem::create(Func, Ty, Base, Offset);
7533 }
7534 return llvm::cast<X86OperandMem>(DoLegalize ? legalize(Mem) : Mem);
7535 }
7536
7537 Variable *TargetX8632::makeReg(Type Type, RegNumT RegNum) {
7538 // There aren't any 64-bit integer registers for x86-32.
7539 assert(Type != IceType_i64);
7540 Variable *Reg = Func->makeVariable(Type);
7541 if (RegNum.hasValue())
7542 Reg->setRegNum(RegNum);
7543 else
7544 Reg->setMustHaveReg();
7545 return Reg;
7546 }
7547
7548 const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32, IceType_f64,
7549 IceType_v16i8};
7550
7551 Type TargetX8632::largestTypeInSize(uint32_t Size, uint32_t MaxSize) {
7552 assert(Size != 0);
7553 uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
7554 uint32_t MaxIndex = MaxSize == NoSizeLimit
7555 ? llvm::array_lengthof(TypeForSize) - 1
7556 : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
7557 return TypeForSize[std::min(TyIndex, MaxIndex)];
7558 }
7559
7560 Type TargetX8632::firstTypeThatFitsSize(uint32_t Size, uint32_t MaxSize) {
7561 assert(Size != 0);
7562 uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
7563 if (!llvm::isPowerOf2_32(Size))
7564 ++TyIndex;
7565 uint32_t MaxIndex = MaxSize == NoSizeLimit
7566 ? llvm::array_lengthof(TypeForSize) - 1
7567 : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
7568 return TypeForSize[std::min(TyIndex, MaxIndex)];
7569 }
7570
7571 void TargetX8632::postLower() {
7572 if (Func->getOptLevel() == Opt_m1)
7573 return;
7574 markRedefinitions();
7575 Context.availabilityUpdate();
7576 }
7577
7578 void TargetX8632::emit(const ConstantInteger32 *C) const {
7579 if (!BuildDefs::dump())
7580 return;
7581 Ostream &Str = Ctx->getStrEmit();
7582 Str << "$" << C->getValue();
7583 }
7584
7585 void TargetX8632::emit(const ConstantInteger64 *C) const {
7586 llvm::report_fatal_error("Not expecting to emit 64-bit integers");
7587 }
7588
7589 void TargetX8632::emit(const ConstantFloat *C) const {
7590 if (!BuildDefs::dump())
7591 return;
7592 Ostream &Str = Ctx->getStrEmit();
7593 Str << C->getLabelName();
7594 }
7595
7596 void TargetX8632::emit(const ConstantDouble *C) const {
7597 if (!BuildDefs::dump())
7598 return;
7599 Ostream &Str = Ctx->getStrEmit();
7600 Str << C->getLabelName();
7601 }
7602
7603 void TargetX8632::emit(const ConstantUndef *) const {
7604 llvm::report_fatal_error("undef value encountered by emitter.");
7605 }
7606
7607 void TargetX8632::emit(const ConstantRelocatable *C) const {
7608 if (!BuildDefs::dump())
7609 return;
7610 Ostream &Str = Ctx->getStrEmit();
7611 Str << "$";
7612 emitWithoutPrefix(C);
7613 }
7614
7615 void TargetX8632::emitJumpTable(const Cfg *,
7616 const InstJumpTable *JumpTable) const {
7617 if (!BuildDefs::dump())
7618 return;
7619 Ostream &Str = Ctx->getStrEmit();
7620 Str << "\t.section\t.rodata." << JumpTable->getSectionName()
7621 << ",\"a\",@progbits\n"
7622 "\t.align\t"
7623 << typeWidthInBytes(IceType_i32) << "\n"
7624 << JumpTable->getName() << ":";
7625
7626 for (SizeT I = 0; I < JumpTable->getNumTargets(); ++I)
7627 Str << "\n\t.long\t" << JumpTable->getTarget(I)->getAsmName();
7628 Str << "\n";
7629 }
7630
7631 const TargetX8632::TableFcmpType TargetX8632::TableFcmp[] = {
7632 #define X(val, dflt, swapS, C1, C2, swapV, pred) \
7633 {dflt, swapS, CondX86::C1, CondX86::C2, swapV, CondX86::pred},
7634 FCMPX8632_TABLE
7635 #undef X
7636 };
7637
7638 const size_t TargetX8632::TableFcmpSize = llvm::array_lengthof(TableFcmp);
7639
7640 const TargetX8632::TableIcmp32Type TargetX8632::TableIcmp32[] = {
7641 #define X(val, C_32, C1_64, C2_64, C3_64) {CondX86::C_32},
7642 ICMPX8632_TABLE
7643 #undef X
7644 };
7645
7646 const size_t TargetX8632::TableIcmp32Size = llvm::array_lengthof(TableIcmp32);
7647
7648 const TargetX8632::TableIcmp64Type TargetX8632::TableIcmp64[] = {
7649 #define X(val, C_32, C1_64, C2_64, C3_64) \
7650 {CondX86::C1_64, CondX86::C2_64, CondX86::C3_64},
7651 ICMPX8632_TABLE
7652 #undef X
7653 };
7654
7655 const size_t TargetX8632::TableIcmp64Size = llvm::array_lengthof(TableIcmp64);
7656
7657 std::array<SmallBitVector, RCX86_NUM> TargetX8632::TypeToRegisterSet = {{}};
7658
7659 std::array<SmallBitVector, RCX86_NUM> TargetX8632::TypeToRegisterSetUnfiltered =
7660 {{}};
7661
7662 std::array<SmallBitVector, RegisterSet::Reg_NUM> TargetX8632::RegisterAliases =
7663 {{}};
7664
7665 template <typename T>
7666 void TargetDataX8632::emitConstantPool(GlobalContext *Ctx) {
7667 if (!BuildDefs::dump())
7668 return;
7669 Ostream &Str = Ctx->getStrEmit();
7670 Type Ty = T::Ty;
7671 SizeT Align = typeAlignInBytes(Ty);
7672 ConstantList Pool = Ctx->getConstantPool(Ty);
7673
7674 Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
7675 << "\n";
7676 Str << "\t.align\t" << Align << "\n";
7677
7678 for (Constant *C : Pool) {
7679 if (!C->getShouldBePooled())
7680 continue;
7681 auto *Const = llvm::cast<typename T::IceType>(C);
7682 typename T::IceType::PrimType Value = Const->getValue();
7683 // Use memcpy() to copy bits from Value into RawValue in a way that avoids
7684 // breaking strict-aliasing rules.
7685 typename T::PrimitiveIntType RawValue;
7686 memcpy(&RawValue, &Value, sizeof(Value));
7687 char buf[30];
7688 int CharsPrinted =
7689 snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
7690 assert(CharsPrinted >= 0);
7691 assert((size_t)CharsPrinted < llvm::array_lengthof(buf));
7692 (void)CharsPrinted; // avoid warnings if asserts are disabled
7693 Str << Const->getLabelName();
7694 Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t/* " << T::TypeName << " "
7695 << Value << " */\n";
7696 }
7697 }
7698
7699 void TargetDataX8632::lowerConstants() {
7700 if (getFlags().getDisableTranslation())
7701 return;
7702 switch (getFlags().getOutFileType()) {
7703 case FT_Elf: {
7704 ELFObjectWriter *Writer = Ctx->getObjectWriter();
7705
7706 Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
7707 Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
7708 Writer->writeConstantPool<ConstantInteger32>(IceType_i32);
7709
7710 Writer->writeConstantPool<ConstantFloat>(IceType_f32);
7711 Writer->writeConstantPool<ConstantDouble>(IceType_f64);
7712 } break;
7713 case FT_Asm:
7714 case FT_Iasm: {
7715 OstreamLocker L(Ctx);
7716
7717 emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
7718 emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
7719 emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);
7720
7721 emitConstantPool<PoolTypeConverter<float>>(Ctx);
7722 emitConstantPool<PoolTypeConverter<double>>(Ctx);
7723 } break;
7724 }
7725 }
7726
7727 void TargetDataX8632::lowerJumpTables() {
7728 const bool IsPIC = false;
7729 switch (getFlags().getOutFileType()) {
7730 case FT_Elf: {
7731 ELFObjectWriter *Writer = Ctx->getObjectWriter();
7732 const FixupKind RelocationKind = FK_Abs;
7733 for (const JumpTableData &JT : Ctx->getJumpTables())
7734 Writer->writeJumpTable(JT, RelocationKind, IsPIC);
7735 } break;
7736 case FT_Asm:
7737 // Already emitted from Cfg
7738 break;
7739 case FT_Iasm: {
7740 if (!BuildDefs::dump())
7741 return;
7742 Ostream &Str = Ctx->getStrEmit();
7743 const char *Prefix = IsPIC ? ".data.rel.ro." : ".rodata.";
7744 for (const JumpTableData &JT : Ctx->getJumpTables()) {
7745 Str << "\t.section\t" << Prefix << JT.getSectionName()
7746 << ",\"a\",@progbits\n"
7747 "\t.align\t"
7748 << typeWidthInBytes(IceType_i32) << "\n"
7749 << JT.getName().toString() << ":";
7750
7751 // On X8664 ILP32 pointers are 32-bit hence the use of .long
7752 for (intptr_t TargetOffset : JT.getTargetOffsets())
7753 Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset;
7754 Str << "\n";
7755 }
7756 } break;
7757 }
7758 }
7759
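/// Lower the given global variable declarations, either through the ELF object
/// writer or as textual assembly; the textual path applies the translate-only
/// filter.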
void TargetDataX8632::lowerGlobals(const VariableDeclarationList &Vars,
                                   const std::string &SectionSuffix) {
  const bool IsPIC = false;
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
    Writer->writeDataSection(Vars, FK_Abs, SectionSuffix, IsPIC);
  } break;
  case FT_Asm:
  case FT_Iasm: {
    OstreamLocker L(Ctx);
    for (const VariableDeclaration *Var : Vars) {
      if (getFlags().matchTranslateOnly(Var->getName(), 0)) {
        emitGlobal(*Var, SectionSuffix);
      }
    }
  } break;
  }
}

//------------------------------------------------------------------------------
// __ ______ __ __ ______ ______ __ __ __ ______
// /\ \ /\ __ \/\ \ _ \ \/\ ___\/\ == \/\ \/\ "-.\ \/\ ___\
// \ \ \___\ \ \/\ \ \ \/ ".\ \ \ __\\ \ __<\ \ \ \ \-. \ \ \__ \
// \ \_____\ \_____\ \__/".~\_\ \_____\ \_\ \_\ \_\ \_\\"\_\ \_____\
// \/_____/\/_____/\/_/ \/_/\/_____/\/_/ /_/\/_/\/_/ \/_/\/_____/
//
//------------------------------------------------------------------------------
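/// Add Adjustment to the stack pointer, releasing stack space.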
void TargetX8632::_add_sp(Operand *Adjustment) {
  Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
  _add(esp, Adjustment);
}

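/// Set the stack pointer to NewValue; the move is marked as a redefinition of
/// esp.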
void TargetX8632::_mov_sp(Operand *NewValue) {
  Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
  _redefined(_mov(esp, NewValue));
}

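/// Subtract Adjustment from the stack pointer, reserving stack space.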
void TargetX8632::_sub_sp(Operand *Adjustment) {
  Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
  _sub(esp, Adjustment);
  // Add a fake use of the stack pointer, to prevent the stack pointer
  // adjustment from being dead-code eliminated in a function that doesn't
  // return.
  Context.insert<InstFakeUse>(esp);
}

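/// Emit the frame-pointer prologue: push ebp and copy esp into it.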
void TargetX8632::_link_bp() {
  Variable *ebp = getPhysicalRegister(RegX8632::Reg_ebp);
  Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
  _push(ebp);
  _mov(ebp, esp);
  // Keep ebp live for late-stage liveness analysis (e.g. asm-verbose mode).
  Context.insert<InstFakeUse>(ebp);
}

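/// Emit the frame-pointer epilogue: restore esp from ebp and pop the saved
/// ebp.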
void TargetX8632::_unlink_bp() {
  Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
  Variable *ebp = getPhysicalRegister(RegX8632::Reg_ebp);
  // For late-stage liveness analysis (e.g. asm-verbose mode), adding a fake
  // use of esp before the assignment of esp=ebp keeps previous esp
  // adjustments from being dead-code eliminated.
  Context.insert<InstFakeUse>(esp);
  _mov(esp, ebp);
  _pop(ebp);
}

void TargetX8632::_push_reg(RegNumT RegNum) {
  _push(getPhysicalRegister(RegNum, WordType));
}

void TargetX8632::_pop_reg(RegNumT RegNum) {
  _pop(getPhysicalRegister(RegNum, WordType));
}

/// Lower an indirect jump. No sandboxing is applied, so this is a plain
/// register-indirect jmp.
void TargetX8632::lowerIndirectJump(Variable *JumpTarget) { _jmp(JumpTarget); }

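/// Emit a call to CallTarget; the result, if any, is produced in ReturnReg.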
Inst *TargetX8632::emitCallToTarget(Operand *CallTarget, Variable *ReturnReg,
                                    size_t NumVariadicFpArgs) {
  (void)NumVariadicFpArgs;
  // Note that NumVariadicFpArgs is only used for System V x86-64 variadic
  // calls, because floating point arguments are passed via vector registers,
  // whereas for x86-32, all args are passed via the stack.

  return Context.insert<Insts::Call>(ReturnReg, CallTarget);
}

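/// Move the return value into the location required by the x86-32 calling
/// convention: xmm0 for vectors, the x87 top-of-stack for scalar floats, and
/// eax (with edx holding the upper half of an i64) for integers.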
Variable *TargetX8632::moveReturnValueToRegister(Operand *Value,
                                                 Type ReturnType) {
  if (isVectorType(ReturnType)) {
    return legalizeToReg(Value, RegX8632::Reg_xmm0);
  } else if (isScalarFloatingType(ReturnType)) {
    _fld(Value);
    return nullptr;
  } else {
    assert(ReturnType == IceType_i32 || ReturnType == IceType_i64);
    if (ReturnType == IceType_i64) {
      Variable *eax = legalizeToReg(loOperand(Value), RegX8632::Reg_eax);
      Variable *edx = legalizeToReg(hiOperand(Value), RegX8632::Reg_edx);
      Context.insert<InstFakeUse>(edx);
      return eax;
    } else {
      Variable *Reg = nullptr;
      _mov(Reg, Value, RegX8632::Reg_eax);
      return Reg;
    }
  }
}

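/// On Windows, frames of at least 4096 bytes are probed by calling _chkstk
/// with the frame size in EAX, so the stack is grown one page at a time; on
/// other platforms this is a no-op.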
void TargetX8632::emitStackProbe(size_t StackSizeBytes) {
#if defined(_WIN32)
  if (StackSizeBytes >= 4096) {
    // _chkstk on Win32 is actually __alloca_probe, which adjusts ESP by the
    // stack amount specified in EAX, so we save ESP in ECX, and restore them
    // both after the call.

    Variable *EAX = makeReg(IceType_i32, RegX8632::Reg_eax);
    Variable *ESP = makeReg(IceType_i32, RegX8632::Reg_esp);
    Variable *ECX = makeReg(IceType_i32, RegX8632::Reg_ecx);

    _push_reg(ECX->getRegNum());
    _mov(ECX, ESP);

    _mov(EAX, Ctx->getConstantInt32(StackSizeBytes));

    auto *CallTarget =
        Ctx->getConstantInt32(reinterpret_cast<int32_t>(&_chkstk));
    emitCallToTarget(CallTarget, nullptr);

    _mov(ESP, ECX);
    _pop_reg(ECX->getRegNum());
  }
#endif
}

// In some cases, there are x-macro tables for both high-level and low-level
// instructions/operands that use the same enum key value. The tables are kept
// separate to maintain a proper separation between abstraction layers. There
// is a risk that the tables could get out of sync if enum values are
// reordered or if entries are added or deleted. The following dummy
// namespaces use static_asserts to ensure everything is kept in sync.
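//
// For illustration only, the pattern in miniature (FOO_HI_TABLE, FOO_LO_TABLE,
// and FooHi are hypothetical names, not part of this file):
//
//   enum _tmp_enum {
//   #define X(tag, emit) _tmp_##tag,
//     FOO_LO_TABLE
//   #undef X
//     _num
//   };
//   #define X(tag, str) static const int _table1_##tag = FooHi::tag;
//   FOO_HI_TABLE
//   #undef X
//   #define X(tag, emit) \
//     static const int _table2_##tag = _tmp_##tag; \
//     static_assert(_table1_##tag == _table2_##tag, "tables out of sync");
//   FOO_LO_TABLE
//   #undef X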

namespace {
// Validate the enum values in FCMPX8632_TABLE.
namespace dummy1 {
// Define a temporary set of enum values based on low-level table entries.
enum _tmp_enum {
#define X(val, dflt, swapS, C1, C2, swapV, pred) _tmp_##val,
  FCMPX8632_TABLE
#undef X
  _num
};
// Define a set of constants based on high-level table entries.
#define X(tag, str) static const int _table1_##tag = InstFcmp::tag;
ICEINSTFCMP_TABLE
#undef X
// Define a set of constants based on low-level table entries, and ensure the
// table entry keys are consistent.
#define X(val, dflt, swapS, C1, C2, swapV, pred) \
  static const int _table2_##val = _tmp_##val; \
  static_assert( \
      _table1_##val == _table2_##val, \
      "Inconsistency between FCMPX8632_TABLE and ICEINSTFCMP_TABLE");
FCMPX8632_TABLE
#undef X
// Repeat the static asserts with respect to the high-level table entries in
// case the high-level table has extra entries.
#define X(tag, str) \
  static_assert( \
      _table1_##tag == _table2_##tag, \
      "Inconsistency between FCMPX8632_TABLE and ICEINSTFCMP_TABLE");
ICEINSTFCMP_TABLE
#undef X
} // end of namespace dummy1

// Validate the enum values in ICMPX8632_TABLE.
namespace dummy2 {
// Define a temporary set of enum values based on low-level table entries.
enum _tmp_enum {
#define X(val, C_32, C1_64, C2_64, C3_64) _tmp_##val,
  ICMPX8632_TABLE
#undef X
  _num
};
// Define a set of constants based on high-level table entries.
#define X(tag, reverse, str) static const int _table1_##tag = InstIcmp::tag;
ICEINSTICMP_TABLE
#undef X
// Define a set of constants based on low-level table entries, and ensure the
// table entry keys are consistent.
#define X(val, C_32, C1_64, C2_64, C3_64) \
  static const int _table2_##val = _tmp_##val; \
  static_assert( \
      _table1_##val == _table2_##val, \
      "Inconsistency between ICMPX8632_TABLE and ICEINSTICMP_TABLE");
ICMPX8632_TABLE
#undef X
// Repeat the static asserts with respect to the high-level table entries in
// case the high-level table has extra entries.
#define X(tag, reverse, str) \
  static_assert( \
      _table1_##tag == _table2_##tag, \
      "Inconsistency between ICMPX8632_TABLE and ICEINSTICMP_TABLE");
ICEINSTICMP_TABLE
#undef X
} // end of namespace dummy2

// Validate the enum values in ICETYPEX86_TABLE.
namespace dummy3 {
// Define a temporary set of enum values based on low-level table entries.
enum _tmp_enum {
#define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld) \
  _tmp_##tag,
  ICETYPEX86_TABLE
#undef X
  _num
};
// Define a set of constants based on high-level table entries.
#define X(tag, sizeLog2, align, elts, elty, str, rcstr) \
  static const int _table1_##tag = IceType_##tag;
ICETYPE_TABLE
#undef X
// Define a set of constants based on low-level table entries, and ensure the
// table entry keys are consistent.
#define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld) \
  static const int _table2_##tag = _tmp_##tag; \
  static_assert(_table1_##tag == _table2_##tag, \
                "Inconsistency between ICETYPEX86_TABLE and ICETYPE_TABLE");
ICETYPEX86_TABLE
#undef X
// Repeat the static asserts with respect to the high-level table entries in
// case the high-level table has extra entries.
#define X(tag, sizeLog2, align, elts, elty, str, rcstr) \
  static_assert(_table1_##tag == _table2_##tag, \
                "Inconsistency between ICETYPEX86_TABLE and ICETYPE_TABLE");
ICETYPE_TABLE
#undef X

} // end of namespace dummy3
} // end of anonymous namespace

} // end of namespace X8632
} // end of namespace Ice
