1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===//
2 //
3 //                        The Subzero Code Generator
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Implements the TargetLoweringX8632 class, which consists almost
12 /// entirely of the lowering sequence for each high-level instruction.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "IceTargetLoweringX8632.h"
17 
18 #include "IceCfg.h"
19 #include "IceCfgNode.h"
20 #include "IceClFlags.h"
21 #include "IceDefs.h"
22 #include "IceELFObjectWriter.h"
23 #include "IceGlobalInits.h"
24 #include "IceInstVarIter.h"
25 #include "IceInstX8632.h"
26 #include "IceLiveness.h"
27 #include "IceOperand.h"
28 #include "IcePhiLoweringImpl.h"
29 #include "IceTargetLoweringX8632.def"
30 #include "IceUtils.h"
31 #include "IceVariableSplitting.h"
32 
33 #include "llvm/Support/MathExtras.h"
34 
35 #include <stack>
36 
37 #if defined(_WIN32)
38 extern "C" void _chkstk();
39 #endif
40 
41 namespace X8632 {
42 
43 std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) {
44   return ::Ice::X8632::TargetX8632::create(Func);
45 }
46 
47 std::unique_ptr<::Ice::TargetDataLowering>
48 createTargetDataLowering(::Ice::GlobalContext *Ctx) {
49   return ::Ice::X8632::TargetDataX8632::create(Ctx);
50 }
51 
52 std::unique_ptr<::Ice::TargetHeaderLowering>
53 createTargetHeaderLowering(::Ice::GlobalContext *Ctx) {
54   return ::Ice::X8632::TargetHeaderX86::create(Ctx);
55 }
56 
57 void staticInit(::Ice::GlobalContext *Ctx) {
58   ::Ice::X8632::TargetX8632::staticInit(Ctx);
59 }
60 
61 bool shouldBePooled(const class ::Ice::Constant *C) {
62   return ::Ice::X8632::TargetX8632::shouldBePooled(C);
63 }
64 
65 ::Ice::Type getPointerType() { return ::Ice::Type::IceType_i32; }
66 
67 } // end of namespace X8632
68 
69 namespace Ice {
70 namespace X8632 {
71 
72 /// The number of bits in a byte
73 static constexpr uint32_t X86_CHAR_BIT = 8;
74 /// Size of the return address on the stack
75 static constexpr uint32_t X86_RET_IP_SIZE_BYTES = 4;
76 
77 /// \name Limits for unrolling memory intrinsics.
78 /// @{
79 static constexpr uint32_t MEMCPY_UNROLL_LIMIT = 8;
80 static constexpr uint32_t MEMMOVE_UNROLL_LIMIT = 8;
81 static constexpr uint32_t MEMSET_UNROLL_LIMIT = 8;
82 /// @}
83 
84 BoolFoldingEntry::BoolFoldingEntry(Inst *I)
85     : Instr(I), IsComplex(BoolFolding::hasComplexLowering(I)) {}
86 
87 BoolFolding::BoolFoldingProducerKind
88 BoolFolding::getProducerKind(const Inst *Instr) {
89   if (llvm::isa<InstIcmp>(Instr)) {
90     if (Instr->getSrc(0)->getType() != IceType_i64)
91       return PK_Icmp32;
92     return PK_Icmp64;
93   }
94   if (llvm::isa<InstFcmp>(Instr))
95     return PK_Fcmp;
96   if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
97     if (Arith->getSrc(0)->getType() != IceType_i64) {
98       switch (Arith->getOp()) {
99       default:
100         return PK_None;
101       case InstArithmetic::And:
102       case InstArithmetic::Or:
103         return PK_Arith;
104       }
105     }
106   }
107   return PK_None; // TODO(stichnot): remove this
108 
109   if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
110     switch (Cast->getCastKind()) {
111     default:
112       return PK_None;
113     case InstCast::Trunc:
114       return PK_Trunc;
115     }
116   }
117   return PK_None;
118 }
119 
120 BoolFolding::BoolFoldingConsumerKind
121 BoolFolding::getConsumerKind(const Inst *Instr) {
122   if (llvm::isa<InstBr>(Instr))
123     return CK_Br;
124   if (llvm::isa<InstSelect>(Instr))
125     return CK_Select;
126   return CK_None; // TODO(stichnot): remove this
127 
128   if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
129     switch (Cast->getCastKind()) {
130     default:
131       return CK_None;
132     case InstCast::Sext:
133       return CK_Sext;
134     case InstCast::Zext:
135       return CK_Zext;
136     }
137   }
138   return CK_None;
139 }
140 
141 /// Returns true if the producing instruction has a "complex" lowering sequence.
142 /// This generally means that its lowering sequence requires more than one
143 /// conditional branch, namely 64-bit integer compares and some floating-point
144 /// compares. When this is true and there is more than one consumer, we prefer
145 /// to disable folding, since folding would duplicate the multi-branch sequence.
146 
147 bool BoolFolding::hasComplexLowering(const Inst *Instr) {
148   switch (getProducerKind(Instr)) {
149   default:
150     return false;
151   case PK_Icmp64:
152     return true;
153   case PK_Fcmp:
154     return TargetX8632::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()]
155                .C2 != CondX86::Br_None;
156   }
157 }
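// Illustrative sketch (not necessarily the exact sequence Subzero emits) of
// why PK_Icmp64 is "complex": on x86-32 a 64-bit compare-and-branch cannot be
// a single cmp+jcc, because each operand spans two 32-bit registers. Roughly:
//   cmp  a.hi, b.hi       ; compare high words first
//   jcc  <taken/decided>  ; possibly more than one branch off the high words
//   cmp  a.lo, b.lo       ; only if the high words did not decide it
//   jcc  <target>
// Folding such a producer into several consumers would duplicate this
// multi-branch sequence, which is why init() below refuses to do so.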
158 
159 bool BoolFolding::isValidFolding(
160     BoolFolding::BoolFoldingProducerKind ProducerKind,
161     BoolFolding::BoolFoldingConsumerKind ConsumerKind) {
162   switch (ProducerKind) {
163   default:
164     return false;
165   case PK_Icmp32:
166   case PK_Icmp64:
167   case PK_Fcmp:
168     return (ConsumerKind == CK_Br) || (ConsumerKind == CK_Select);
169   case PK_Arith:
170     return ConsumerKind == CK_Br;
171   }
172 }
173 
174 void BoolFolding::init(CfgNode *Node) {
175   Producers.clear();
176   for (Inst &Instr : Node->getInsts()) {
177     if (Instr.isDeleted())
178       continue;
179     invalidateProducersOnStore(&Instr);
180     // Check whether Instr is a valid producer.
181     Variable *Var = Instr.getDest();
182     if (Var) { // only consider instructions with an actual dest var
183       if (isBooleanType(Var->getType())) {        // only bool-type dest vars
184         if (getProducerKind(&Instr) != PK_None) { // white-listed instructions
185           Producers[Var->getIndex()] = BoolFoldingEntry(&Instr);
186         }
187       }
188     }
189     // Check each src variable against the map.
190     FOREACH_VAR_IN_INST(Var, Instr) {
191       SizeT VarNum = Var->getIndex();
192       if (!containsValid(VarNum))
193         continue;
194       // All valid consumers use Var as the first source operand
195       if (IndexOfVarOperandInInst(Var) != 0) {
196         setInvalid(VarNum);
197         continue;
198       }
199       // Consumer instructions must be white-listed
200       BoolFolding::BoolFoldingConsumerKind ConsumerKind =
201           getConsumerKind(&Instr);
202       if (ConsumerKind == CK_None) {
203         setInvalid(VarNum);
204         continue;
205       }
206       BoolFolding::BoolFoldingProducerKind ProducerKind =
207           getProducerKind(Producers[VarNum].Instr);
208       if (!isValidFolding(ProducerKind, ConsumerKind)) {
209         setInvalid(VarNum);
210         continue;
211       }
212       // Avoid creating multiple copies of complex producer instructions.
213       if (Producers[VarNum].IsComplex && Producers[VarNum].NumUses > 0) {
214         setInvalid(VarNum);
215         continue;
216       }
217       ++Producers[VarNum].NumUses;
218       if (Instr.isLastUse(Var)) {
219         Producers[VarNum].IsLiveOut = false;
220       }
221     }
222   }
223   for (auto &I : Producers) {
224     // Ignore entries previously marked invalid.
225     if (I.second.Instr == nullptr)
226       continue;
227     // Disable the producer if its dest may be live beyond this block.
228     if (I.second.IsLiveOut) {
229       setInvalid(I.first);
230       continue;
231     }
232     // Mark as "dead" rather than outright deleting. This is so that other
233     // peephole style optimizations during or before lowering have access to
234     // this instruction in undeleted form. See for example
235     // tryOptimizedCmpxchgCmpBr().
236     I.second.Instr->setDead();
237   }
238 }
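// Illustrative example of the folding that init() sets up (hedged sketch):
//   %flag = icmp slt i32 %a, %b       ; producer, marked dead above
//   br i1 %flag, label %T, label %F   ; consumer
// can then be lowered as a single "cmp %a, %b" followed by "jl %T / jmp %F",
// instead of first materializing %flag with a setcc-style sequence and then
// testing it again for the branch.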
239 
240 const Inst *BoolFolding::getProducerFor(const Operand *Opnd) const {
241   auto *Var = llvm::dyn_cast<const Variable>(Opnd);
242   if (Var == nullptr)
243     return nullptr;
244   SizeT VarNum = Var->getIndex();
245   auto Element = Producers.find(VarNum);
246   if (Element == Producers.end())
247     return nullptr;
248   return Element->second.Instr;
249 }
250 
251 void BoolFolding::dump(const Cfg *Func) const {
252   if (!BuildDefs::dump() || !Func->isVerbose(IceV_Folding))
253     return;
254   OstreamLocker L(Func->getContext());
255   Ostream &Str = Func->getContext()->getStrDump();
256   for (auto &I : Producers) {
257     if (I.second.Instr == nullptr)
258       continue;
259     Str << "Found foldable producer:\n  ";
260     I.second.Instr->dump(Func);
261     Str << "\n";
262   }
263 }
264 
265 /// If the given instruction has potential memory side effects (e.g. store, rmw,
266 /// or a call instruction with potential memory side effects), then we must not
267 /// allow a pre-store Producer instruction with memory operands to be folded
268 /// into a post-store Consumer instruction.  If this is detected, the Producer
269 /// is invalidated.
270 ///
271 /// We use the Producer's IsLiveOut field to determine whether any potential
272 /// Consumers come after this store instruction.  The IsLiveOut field is
273 /// initialized to true, and BoolFolding::init() sets IsLiveOut to false when it
274 /// sees the variable's definitive last use (indicating the variable is not in
275 /// the node's live-out set).  Thus if we see here that IsLiveOut is false, we
276 /// know that there can be no consumers after the store, and therefore we know
277 /// the folding is safe despite the store instruction.
278 
279 void BoolFolding::invalidateProducersOnStore(const Inst *Instr) {
280   if (!Instr->isMemoryWrite())
281     return;
282   for (auto &ProducerPair : Producers) {
283     if (!ProducerPair.second.IsLiveOut)
284       continue;
285     Inst *PInst = ProducerPair.second.Instr;
286     if (PInst == nullptr)
287       continue;
288     bool HasMemOperand = false;
289     const SizeT SrcSize = PInst->getSrcSize();
290     for (SizeT I = 0; I < SrcSize; ++I) {
291       if (llvm::isa<X86OperandMem>(PInst->getSrc(I))) {
292         HasMemOperand = true;
293         break;
294       }
295     }
296     if (!HasMemOperand)
297       continue;
298     setInvalid(ProducerPair.first);
299   }
300 }
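// Hedged illustration of the hazard this guards against:
//   %flag = icmp eq i32 [mem], 0   ; producer with a memory source operand
//   store i32 1, i32* [mem]        ; potential write to the same location
//   br i1 %flag, ...               ; consumer after the store
// Folding the producer into the consumer would move the memory read past the
// store and could observe the wrong value, so the producer is invalidated.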
301 
302 void TargetX8632::initNodeForLowering(CfgNode *Node) {
303   FoldingInfo.init(Node);
304   FoldingInfo.dump(Func);
305 }
306 
307 TargetX8632::TargetX8632(Cfg *Func) : TargetX86(Func) {}
308 
309 void TargetX8632::staticInit(GlobalContext *Ctx) {
310   RegNumT::setLimit(RegX8632::Reg_NUM);
311   RegX8632::initRegisterSet(getFlags(), &TypeToRegisterSet, &RegisterAliases);
312   for (size_t i = 0; i < TypeToRegisterSet.size(); ++i)
313     TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
314   filterTypeToRegisterSet(Ctx, RegX8632::Reg_NUM, TypeToRegisterSet.data(),
315                           TypeToRegisterSet.size(), RegX8632::getRegName,
316                           getRegClassName);
317 }
318 
319 bool TargetX8632::shouldBePooled(const Constant *C) {
320   if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(C)) {
321     return !Utils::isPositiveZero(ConstFloat->getValue());
322   }
323   if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(C)) {
324     return !Utils::isPositiveZero(ConstDouble->getValue());
325   }
326   return false;
327 }
328 
329 Type TargetX8632::getPointerType() { return IceType_i32; }
330 
331 void TargetX8632::translateO2() {
332   TimerMarker T(TimerStack::TT_O2, Func);
333 
334   genTargetHelperCalls();
335   Func->dump("After target helper call insertion");
336 
337   // Merge Alloca instructions, and lay out the stack.
338   static constexpr bool SortAndCombineAllocas = true;
339   Func->processAllocas(SortAndCombineAllocas);
340   Func->dump("After Alloca processing");
341 
342   // Run this early so it can be used to focus optimizations on potentially hot
343   // code.
344   // TODO(stichnot,ascull): currently only used for regalloc, not for
345   // expensive high-level optimizations, which could also be focused on
346   // potentially hot code.
347   Func->generateLoopInfo();
348   Func->dump("After loop analysis");
349   if (getFlags().getLoopInvariantCodeMotion()) {
350     Func->loopInvariantCodeMotion();
351     Func->dump("After LICM");
352   }
353 
354   if (getFlags().getLocalCSE() != Ice::LCSE_Disabled) {
355     Func->localCSE(getFlags().getLocalCSE() == Ice::LCSE_EnabledSSA);
356     Func->dump("After Local CSE");
357     Func->floatConstantCSE();
358   }
359   if (getFlags().getEnableShortCircuit()) {
360     Func->shortCircuitJumps();
361     Func->dump("After Short Circuiting");
362   }
363 
364   if (!getFlags().getEnablePhiEdgeSplit()) {
365     // Lower Phi instructions.
366     Func->placePhiLoads();
367     if (Func->hasError())
368       return;
369     Func->placePhiStores();
370     if (Func->hasError())
371       return;
372     Func->deletePhis();
373     if (Func->hasError())
374       return;
375     Func->dump("After Phi lowering");
376   }
377 
378   // Address mode optimization.
379   Func->getVMetadata()->init(VMK_SingleDefs);
380   Func->doAddressOpt();
381   Func->materializeVectorShuffles();
382 
383   // Find read-modify-write opportunities. Do this after address mode
384   // optimization so that doAddressOpt() doesn't need to be applied to RMW
385   // instructions as well.
386   findRMW();
387   Func->dump("After RMW transform");
388 
389   // Argument lowering
390   Func->doArgLowering();
391 
392   // Target lowering. This requires liveness analysis for some parts of the
393   // lowering decisions, such as compare/branch fusing. If non-lightweight
394   // liveness analysis is used, the instructions need to be renumbered first.
395   // TODO: This renumbering should only be necessary if we're actually
396   // calculating live intervals, which we only do for register allocation.
397   Func->renumberInstructions();
398   if (Func->hasError())
399     return;
400 
401   // TODO: It should be sufficient to use the fastest liveness calculation,
402   // i.e. livenessLightweight(). However, for some reason that slows down the
403   // rest of the translation. Investigate.
404   Func->liveness(Liveness_Basic);
405   if (Func->hasError())
406     return;
407   Func->dump("After x86 address mode opt");
408 
409   doLoadOpt();
410 
411   Func->genCode();
412   if (Func->hasError())
413     return;
414   Func->dump("After x86 codegen");
415   splitBlockLocalVariables(Func);
416 
417   // Register allocation. This requires instruction renumbering and full
418   // liveness analysis. Loops must be identified before liveness so variable
419   // use weights are correct.
420   Func->renumberInstructions();
421   if (Func->hasError())
422     return;
423   Func->liveness(Liveness_Intervals);
424   if (Func->hasError())
425     return;
426   // The post-codegen dump is done here, after liveness analysis and associated
427   // cleanup, to make the dump cleaner and more useful.
428   Func->dump("After initial x86 codegen");
429   // Validate the live range computations. The expensive validation call is
430   // deliberately only made when assertions are enabled.
431   assert(Func->validateLiveness());
432   Func->getVMetadata()->init(VMK_All);
433   regAlloc(RAK_Global);
434   if (Func->hasError())
435     return;
436   Func->dump("After linear scan regalloc");
437 
438   if (getFlags().getEnablePhiEdgeSplit()) {
439     Func->advancedPhiLowering();
440     Func->dump("After advanced Phi lowering");
441   }
442 
443   // Stack frame mapping.
444   Func->genFrame();
445   if (Func->hasError())
446     return;
447   Func->dump("After stack frame mapping");
448 
449   Func->contractEmptyNodes();
450   Func->reorderNodes();
451 
452   // Branch optimization.  This needs to be done just before code emission. In
453   // particular, no transformations that insert or reorder CfgNodes should be
454   // done after branch optimization. We go ahead and do it before nop insertion
455   // to reduce the amount of work needed to search for opportunities.
456   Func->doBranchOpt();
457   Func->dump("After branch optimization");
458 }
459 
460 void TargetX8632::translateOm1() {
461   TimerMarker T(TimerStack::TT_Om1, Func);
462 
463   genTargetHelperCalls();
464 
465   // Do not merge Alloca instructions, and lay out the stack.
466   // static constexpr bool SortAndCombineAllocas = false;
467   static constexpr bool SortAndCombineAllocas =
468       true; // TODO(b/171222930): Fix Win32 bug when this is false
469   Func->processAllocas(SortAndCombineAllocas);
470   Func->dump("After Alloca processing");
471 
472   Func->placePhiLoads();
473   if (Func->hasError())
474     return;
475   Func->placePhiStores();
476   if (Func->hasError())
477     return;
478   Func->deletePhis();
479   if (Func->hasError())
480     return;
481   Func->dump("After Phi lowering");
482 
483   Func->doArgLowering();
484   Func->genCode();
485   if (Func->hasError())
486     return;
487   Func->dump("After initial x86 codegen");
488 
489   regAlloc(RAK_InfOnly);
490   if (Func->hasError())
491     return;
492   Func->dump("After regalloc of infinite-weight variables");
493 
494   Func->genFrame();
495   if (Func->hasError())
496     return;
497   Func->dump("After stack frame mapping");
498 }
499 
500 inline bool canRMW(const InstArithmetic *Arith) {
501   Type Ty = Arith->getDest()->getType();
502   // X86 vector instructions write to a register and have no RMW option.
503   if (isVectorType(Ty))
504     return false;
505   bool isI64 = Ty == IceType_i64;
506 
507   switch (Arith->getOp()) {
508   // Not handled for lack of simple lowering:
509   //   shift on i64
510   //   mul, udiv, urem, sdiv, srem, frem
511   // Not handled for lack of RMW instructions:
512   //   fadd, fsub, fmul, fdiv (also vector types)
513   default:
514     return false;
515   case InstArithmetic::Add:
516   case InstArithmetic::Sub:
517   case InstArithmetic::And:
518   case InstArithmetic::Or:
519   case InstArithmetic::Xor:
520     return true;
521   case InstArithmetic::Shl:
522   case InstArithmetic::Lshr:
523   case InstArithmetic::Ashr:
524     return false; // TODO(stichnot): implement
525     return !isI64;
526   }
527 }
528 
529 bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
530   if (A == B)
531     return true;
532   if (auto *MemA = llvm::dyn_cast<X86OperandMem>(A)) {
533     if (auto *MemB = llvm::dyn_cast<X86OperandMem>(B)) {
534       return MemA->getBase() == MemB->getBase() &&
535              MemA->getOffset() == MemB->getOffset() &&
536              MemA->getIndex() == MemB->getIndex() &&
537              MemA->getShift() == MemB->getShift() &&
538              MemA->getSegmentRegister() == MemB->getSegmentRegister();
539     }
540   }
541   return false;
542 }
543 
544 void TargetX8632::findRMW() {
545   TimerMarker _(TimerStack::TT_findRMW, Func);
546   Func->dump("Before RMW");
547   if (Func->isVerbose(IceV_RMW))
548     Func->getContext()->lockStr();
549   for (CfgNode *Node : Func->getNodes()) {
550     // Walk through the instructions, considering each sequence of 3
551     // instructions, and look for the particular RMW pattern. Note that this
552     // search can be "broken" (false negatives) if there are intervening
553     // deleted instructions, or intervening instructions that could be safely
554     // moved out of the way to reveal an RMW pattern.
555     auto E = Node->getInsts().end();
556     auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
557     for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
558       // Make I3 skip over deleted instructions.
559       while (I3 != E && I3->isDeleted())
560         ++I3;
561       if (I1 == E || I2 == E || I3 == E)
562         continue;
563       assert(!I1->isDeleted());
564       assert(!I2->isDeleted());
565       assert(!I3->isDeleted());
566       auto *Load = llvm::dyn_cast<InstLoad>(I1);
567       auto *Arith = llvm::dyn_cast<InstArithmetic>(I2);
568       auto *Store = llvm::dyn_cast<InstStore>(I3);
569       if (!Load || !Arith || !Store)
570         continue;
571       // Look for:
572       //   a = Load addr
573       //   b = <op> a, other
574       //   Store b, addr
575       // Change to:
576       //   a = Load addr
577       //   b = <op> a, other
578       //   x = FakeDef
579       //   RMW <op>, addr, other, x
580       //   b = Store b, addr, x
581       // Note that inferTwoAddress() makes sure setDestRedefined() gets called
582       // on the updated Store instruction, to avoid liveness problems later.
583       //
584       // With this transformation, the Store instruction acquires a Dest
585       // variable and is now subject to dead code elimination if there are no
586       // more uses of "b".  Variable "x" is a beacon for determining whether the
587       // Store instruction gets dead-code eliminated.  If the Store instruction
588       // is eliminated, then it must be the case that the RMW instruction ends
589       // x's live range, and therefore the RMW instruction will be retained and
590       // later lowered.  On the other hand, if the RMW instruction does not end
591       // x's live range, then the Store instruction must still be present, and
592       // therefore the RMW instruction is ignored during lowering because it is
593       // redundant with the Store instruction.
594       //
595       // Note that if "a" has further uses, the RMW transformation may still
596       // trigger, resulting in two loads and one store, which is worse than the
597       // original one load and one store.  However, this is probably rare, and
598       // caching probably keeps it just as fast.
599       if (!isSameMemAddressOperand(Load->getLoadAddress(),
600                                    Store->getStoreAddress()))
601         continue;
602       Operand *ArithSrcFromLoad = Arith->getSrc(0);
603       Operand *ArithSrcOther = Arith->getSrc(1);
604       if (ArithSrcFromLoad != Load->getDest()) {
605         if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
606           continue;
607         std::swap(ArithSrcFromLoad, ArithSrcOther);
608       }
609       if (Arith->getDest() != Store->getData())
610         continue;
611       if (!canRMW(Arith))
612         continue;
613       if (Func->isVerbose(IceV_RMW)) {
614         Ostream &Str = Func->getContext()->getStrDump();
615         Str << "Found RMW in " << Func->getFunctionName() << ":\n  ";
616         Load->dump(Func);
617         Str << "\n  ";
618         Arith->dump(Func);
619         Str << "\n  ";
620         Store->dump(Func);
621         Str << "\n";
622       }
623       Variable *Beacon = Func->makeVariable(IceType_i32);
624       Beacon->setMustNotHaveReg();
625       Store->setRmwBeacon(Beacon);
626       auto *BeaconDef = InstFakeDef::create(Func, Beacon);
627       Node->getInsts().insert(I3, BeaconDef);
628       auto *RMW =
629           InstX86FakeRMW::create(Func, ArithSrcOther, Store->getStoreAddress(),
630                                  Beacon, Arith->getOp());
631       Node->getInsts().insert(I3, RMW);
632     }
633   }
634   if (Func->isVerbose(IceV_RMW))
635     Func->getContext()->unlockStr();
636 }
637 
638 /// Value is in bytes. Return Value adjusted to the next highest multiple of
639 /// the stack alignment.
640 uint32_t TargetX8632::applyStackAlignment(uint32_t Value) {
641   return Utils::applyAlignment(Value, X86_STACK_ALIGNMENT_BYTES);
642 }
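// For example (illustrative only, assuming a 16-byte X86_STACK_ALIGNMENT_BYTES):
//   applyStackAlignment(0)  == 0
//   applyStackAlignment(20) == 32
//   applyStackAlignment(32) == 32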
643 
644 // Converts a ConstantInteger32 operand into its constant value, or
645 // MemoryOrderInvalid if the operand is not a ConstantInteger32.
646 inline uint64_t getConstantMemoryOrder(Operand *Opnd) {
647   if (auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
648     return Integer->getValue();
649   return Intrinsics::MemoryOrderInvalid;
650 }
651 
652 /// Determines whether the dest of a Load instruction can be folded into one of
653 /// the src operands of a 2-operand instruction. This is true as long as the
654 /// load dest matches exactly one of the binary instruction's src operands.
655 /// Replaces Src0 or Src1 with LoadSrc if the answer is true.
656 inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
657                                       Operand *&Src0, Operand *&Src1) {
658   if (Src0 == LoadDest && Src1 != LoadDest) {
659     Src0 = LoadSrc;
660     return true;
661   }
662   if (Src0 != LoadDest && Src1 == LoadDest) {
663     Src1 = LoadSrc;
664     return true;
665   }
666   return false;
667 }
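// Hedged illustration of the folding performed by doLoadOpt() below:
//   %v = load i32, i32* %addr
//   %r = add i32 %v, %x            ; last use of %v
// is rewritten so the arithmetic consumes the memory operand directly:
//   %r = add i32 {mem %addr}, %x
// allowing the lowering to emit a single "add reg, [addr]"-style instruction
// instead of a separate load.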
668 
669 void TargetX8632::doLoadOpt() {
670   TimerMarker _(TimerStack::TT_loadOpt, Func);
671   for (CfgNode *Node : Func->getNodes()) {
672     Context.init(Node);
673     while (!Context.atEnd()) {
674       Variable *LoadDest = nullptr;
675       Operand *LoadSrc = nullptr;
676       Inst *CurInst = iteratorToInst(Context.getCur());
677       Inst *Next = Context.getNextInst();
678       // Determine whether the current instruction is a Load instruction or
679       // equivalent.
680       if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
681         // An InstLoad qualifies unless it uses a 64-bit absolute address,
682         // which requires legalization to insert a copy to register.
683         // TODO(b/148272103): Fold these after legalization.
684         LoadDest = Load->getDest();
685         constexpr bool DoLegalize = false;
686         LoadSrc = formMemoryOperand(Load->getLoadAddress(), LoadDest->getType(),
687                                     DoLegalize);
688       } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsic>(CurInst)) {
689         // An AtomicLoad intrinsic qualifies as long as it has a valid memory
690         // ordering, and can be implemented in a single instruction (i.e., not
691         // i64 on x86-32).
692         Intrinsics::IntrinsicID ID = Intrin->getIntrinsicID();
693         if (ID == Intrinsics::AtomicLoad &&
694             (Intrin->getDest()->getType() != IceType_i64) &&
695             Intrinsics::isMemoryOrderValid(
696                 ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
697           LoadDest = Intrin->getDest();
698           constexpr bool DoLegalize = false;
699           LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
700                                       DoLegalize);
701         }
702       }
703       // A Load instruction can be folded into the following instruction only
704       // if the following instruction ends the Load's Dest variable's live
705       // range.
706       if (LoadDest && Next && Next->isLastUse(LoadDest)) {
707         assert(LoadSrc);
708         Inst *NewInst = nullptr;
709         if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
710           Operand *Src0 = Arith->getSrc(0);
711           Operand *Src1 = Arith->getSrc(1);
712           if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
713             NewInst = InstArithmetic::create(Func, Arith->getOp(),
714                                              Arith->getDest(), Src0, Src1);
715           }
716         } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
717           Operand *Src0 = Icmp->getSrc(0);
718           Operand *Src1 = Icmp->getSrc(1);
719           if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
720             NewInst = InstIcmp::create(Func, Icmp->getCondition(),
721                                        Icmp->getDest(), Src0, Src1);
722           }
723         } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
724           Operand *Src0 = Fcmp->getSrc(0);
725           Operand *Src1 = Fcmp->getSrc(1);
726           if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
727             NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
728                                        Fcmp->getDest(), Src0, Src1);
729           }
730         } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
731           Operand *Src0 = Select->getTrueOperand();
732           Operand *Src1 = Select->getFalseOperand();
733           if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
734             NewInst = InstSelect::create(Func, Select->getDest(),
735                                          Select->getCondition(), Src0, Src1);
736           }
737         } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
738           // The load dest can always be folded into a Cast instruction.
739           auto *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
740           if (Src0 == LoadDest) {
741             NewInst = InstCast::create(Func, Cast->getCastKind(),
742                                        Cast->getDest(), LoadSrc);
743           }
744         }
745         if (NewInst) {
746           CurInst->setDeleted();
747           Next->setDeleted();
748           Context.insert(NewInst);
749           // Update NewInst->LiveRangesEnded so that target lowering may
750           // benefit. Also update NewInst->HasSideEffects.
751           NewInst->spliceLivenessInfo(Next, CurInst);
752         }
753       }
754       Context.advanceCur();
755       Context.advanceNext();
756     }
757   }
758   Func->dump("After load optimization");
759 }
760 
761 bool TargetX8632::doBranchOpt(Inst *I, const CfgNode *NextNode) {
762   if (auto *Br = llvm::dyn_cast<InstX86Br>(I)) {
763     return Br->optimizeBranch(NextNode);
764   }
765   return false;
766 }
767 
768 Variable *TargetX8632::getPhysicalRegister(RegNumT RegNum, Type Ty) {
769   if (Ty == IceType_void)
770     Ty = IceType_i32;
771   if (PhysicalRegisters[Ty].empty())
772     PhysicalRegisters[Ty].resize(RegX8632::Reg_NUM);
773   assert(unsigned(RegNum) < PhysicalRegisters[Ty].size());
774   Variable *Reg = PhysicalRegisters[Ty][RegNum];
775   if (Reg == nullptr) {
776     Reg = Func->makeVariable(Ty);
777     Reg->setRegNum(RegNum);
778     PhysicalRegisters[Ty][RegNum] = Reg;
779     // Specially mark a named physical register as an "argument" so that it is
780     // considered live upon function entry.  Otherwise it's possible to get
781     // liveness validation errors for saving callee-save registers.
782     Func->addImplicitArg(Reg);
783     // Don't bother tracking the live range of a named physical register.
784     Reg->setIgnoreLiveness();
785   }
786   assert(RegX8632::getGprForType(Ty, RegNum) == RegNum);
787   return Reg;
788 }
789 
790 const char *TargetX8632::getRegName(RegNumT RegNum, Type Ty) const {
791   return RegX8632::getRegName(RegX8632::getGprForType(Ty, RegNum));
792 }
793 
794 void TargetX8632::emitVariable(const Variable *Var) const {
795   if (!BuildDefs::dump())
796     return;
797   Ostream &Str = Ctx->getStrEmit();
798   if (Var->hasReg()) {
799     Str << "%" << getRegName(Var->getRegNum(), Var->getType());
800     return;
801   }
802   if (Var->mustHaveReg()) {
803     llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
804                              ") has no register assigned - function " +
805                              Func->getFunctionName());
806   }
807   const int32_t Offset = Var->getStackOffset();
808   auto BaseRegNum = Var->getBaseRegNum();
809   if (BaseRegNum.hasNoValue())
810     BaseRegNum = getFrameOrStackReg();
811 
812   // Print in the form "Offset(%reg)", omitting Offset when it is 0.
813   if (getFlags().getDecorateAsm()) {
814     Str << Var->getSymbolicStackOffset();
815   } else if (Offset != 0) {
816     Str << Offset;
817   }
818   const Type FrameSPTy = WordType;
819   Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
820 }
821 
822 void TargetX8632::addProlog(CfgNode *Node) {
823   // Stack frame layout:
824   //
825   // +------------------------+  ^ +
826   // | 1. return address      |  |
827   // +------------------------+  v -
828   // | 2. preserved registers |
829   // +------------------------+ <--- BasePointer (if used)
830   // | 3. padding             |
831   // +------------------------+
832   // | 4. global spill area   |
833   // +------------------------+
834   // | 5. padding             |
835   // +------------------------+
836   // | 6. local spill area    |
837   // +------------------------+
838   // | 7. padding             |
839   // +------------------------+
840   // | 7.5 shadow (WinX64)    |
841   // +------------------------+
842   // | 8. allocas             |
843   // +------------------------+
844   // | 9. padding             |
845   // +------------------------+
846   // | 10. out args           |
847   // +------------------------+ <--- StackPointer
848   //
849   // The following variables record the size in bytes of the given areas:
850   //  * X86_RET_IP_SIZE_BYTES:   area 1
851   //  * PreservedRegsSizeBytes:  area 2
852   //  * SpillAreaPaddingBytes:   area 3
853   //  * GlobalsSize:             area 4
854   //  * LocalsSlotsPaddingBytes: area 5
855   //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
856   //  * LocalsSpillAreaSize:     area 6
857   //  * FixedAllocaSizeBytes:    areas 7 - 8
858   //  * SpillAreaSizeBytes:      areas 3 - 10
859   //  * maxOutArgsSizeBytes():   areas 9 - 10
860 
861   // Determine stack frame offsets for each Variable without a register
862   // assignment. This can be done as one variable per stack slot. Or, do
863   // coalescing by running the register allocator again with an infinite set of
864   // registers (as a side effect, this gives variables a second chance at
865   // physical register assignment).
866   //
867   // A middle ground approach is to leverage sparsity and allocate one block of
868   // space on the frame for globals (variables with multi-block lifetime), and
869   // one block to share for locals (single-block lifetime).
870 
871   // StackPointer: points just past return address of calling function
872 
873   Context.init(Node);
874   Context.setInsertPoint(Context.getCur());
875 
876   SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
877   RegsUsed = SmallBitVector(CalleeSaves.size());
878   VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
879   size_t GlobalsSize = 0;
880   // If there is a separate locals area, this represents that area. Otherwise
881   // it counts any variable not counted by GlobalsSize.
882   SpillAreaSizeBytes = 0;
883   // If there is a separate locals area, this specifies the alignment for it.
884   uint32_t LocalsSlotsAlignmentBytes = 0;
885   // The entire spill locations area gets aligned to largest natural alignment
886   // of the variables that have a spill slot.
887   uint32_t SpillAreaAlignmentBytes = 0;
888   // A spill slot linked to a variable with a stack slot should reuse that
889   // stack slot.
890   std::function<bool(Variable *)> TargetVarHook =
891       [&VariablesLinkedToSpillSlots](Variable *Var) {
892         // TODO(stichnot): Refactor this into the base class.
893         Variable *Root = Var->getLinkedToStackRoot();
894         if (Root != nullptr) {
895           assert(!Root->hasReg());
896           if (!Root->hasReg()) {
897             VariablesLinkedToSpillSlots.push_back(Var);
898             return true;
899           }
900         }
901         return false;
902       };
903 
904   // Compute the list of spilled variables and bounds for GlobalsSize, etc.
905   getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
906                         &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
907                         &LocalsSlotsAlignmentBytes, TargetVarHook);
908   uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
909   SpillAreaSizeBytes += GlobalsSize;
910 
911   // Add push instructions for preserved registers.
912   uint32_t NumCallee = 0;
913   size_t PreservedRegsSizeBytes = 0;
914   SmallBitVector Pushed(CalleeSaves.size());
915   for (RegNumT i : RegNumBVIter(CalleeSaves)) {
916     const auto Canonical = RegX8632::getBaseReg(i);
917     assert(Canonical == RegX8632::getBaseReg(Canonical));
918     if (RegsUsed[i]) {
919       Pushed[Canonical] = true;
920     }
921   }
922   for (RegNumT RegNum : RegNumBVIter(Pushed)) {
923     assert(RegNum == RegX8632::getBaseReg(RegNum));
924     ++NumCallee;
925     if (RegX8632::isXmm(RegNum)) {
926       PreservedRegsSizeBytes += 16;
927     } else {
928       PreservedRegsSizeBytes += typeWidthInBytes(WordType);
929     }
930     _push_reg(RegNum);
931   }
932   Ctx->statsUpdateRegistersSaved(NumCallee);
933 
934   // StackPointer: points past preserved registers at start of spill area
935 
936   // Generate "push frameptr; mov frameptr, stackptr"
937   if (IsEbpBasedFrame) {
938     assert(
939         (RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None)).count() ==
940         0);
941     PreservedRegsSizeBytes += typeWidthInBytes(WordType);
942     _link_bp();
943   }
944 
945   // Align the variables area. SpillAreaPaddingBytes is the size of the region
946   // after the preserved registers and before the spill areas.
947   // LocalsSlotsPaddingBytes is the amount of padding between the globals and
948   // locals area if they are separate.
949   assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
950   uint32_t SpillAreaPaddingBytes = 0;
951   uint32_t LocalsSlotsPaddingBytes = 0;
952   alignStackSpillAreas(X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
953                        SpillAreaAlignmentBytes, GlobalsSize,
954                        LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
955                        &LocalsSlotsPaddingBytes);
956   SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
957   uint32_t GlobalsAndSubsequentPaddingSize =
958       GlobalsSize + LocalsSlotsPaddingBytes;
959 
960   // Functions returning scalar floating point types may need to convert values
961   // from an in-register xmm value to the top of the x87 floating point stack.
962   // This is done by a movp[sd] and an fld[sd].  Ensure there is enough scratch
963   // space on the stack for this.
964   const Type ReturnType = Func->getReturnType();
965   if (isScalarFloatingType(ReturnType)) {
966     // Avoid misaligned double-precision load/store.
967     RequiredStackAlignment =
968         std::max<size_t>(RequiredStackAlignment, X86_STACK_ALIGNMENT_BYTES);
969     SpillAreaSizeBytes =
970         std::max(typeWidthInBytesOnStack(ReturnType), SpillAreaSizeBytes);
971   }
972 
973   RequiredStackAlignment =
974       std::max<size_t>(RequiredStackAlignment, SpillAreaAlignmentBytes);
975 
976   if (PrologEmitsFixedAllocas) {
977     RequiredStackAlignment =
978         std::max(RequiredStackAlignment, FixedAllocaAlignBytes);
979   }
980 
981   // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
982   // fixed allocations in the prolog.
983   if (PrologEmitsFixedAllocas)
984     SpillAreaSizeBytes += FixedAllocaSizeBytes;
985 
986   // Entering the function has made the stack pointer unaligned. Re-align it by
987   // adjusting the stack size.
988   // Note that StackOffset does not include spill area. It's the offset from the
989   // base stack pointer (ebp), whether we set it or not, to the first stack
990   // arg (if any). StackSize, on the other hand, does include the spill area.
991   const uint32_t StackOffset = X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
992   uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
993                                              RequiredStackAlignment);
994   StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
995                                     RequiredStackAlignment);
996   SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any
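  // Worked example (illustrative numbers only): with StackOffset == 8,
  // SpillAreaSizeBytes == 20, maxOutArgsSizeBytes() == 0, and a required
  // alignment of 16, StackSize becomes applyAlignment(28, 16) == 32, and
  // SpillAreaSizeBytes is re-set to 32 - 8 == 24 so that the "sub esp"
  // emitted below restores 16-byte alignment.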
997 
998   if (SpillAreaSizeBytes) {
999     auto *Func = Node->getCfg();
1000     if (SpillAreaSizeBytes > Func->getStackSizeLimit()) {
1001       Func->setError("Stack size limit exceeded");
1002     }
1003 
1004     emitStackProbe(SpillAreaSizeBytes);
1005 
1006     // Generate "sub stackptr, SpillAreaSizeBytes"
1007     _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
1008   }
1009 
1010   // StackPointer: points just past the spill area (end of stack frame)
1011 
1012   // If the required alignment is greater than the stack pointer's guaranteed
1013   // alignment, align the stack pointer accordingly.
1014   if (RequiredStackAlignment > X86_STACK_ALIGNMENT_BYTES) {
1015     assert(IsEbpBasedFrame);
1016     _and(getPhysicalRegister(getStackReg(), WordType),
1017          Ctx->getConstantInt32(-RequiredStackAlignment));
1018   }
1019 
1020   // StackPointer: may have just been offset for alignment
1021 
1022   // Account for known-frame-offset alloca instructions that were not already
1023   // combined into the prolog.
1024   if (!PrologEmitsFixedAllocas)
1025     SpillAreaSizeBytes += FixedAllocaSizeBytes;
1026 
1027   Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
1028 
1029   // Fill in stack offsets for stack args, and copy args into registers for
1030   // those that were register-allocated. Args are pushed right to left, so
1031   // Arg[0] is closest to the stack/frame pointer.
1032   RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
1033   Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, WordType);
1034   size_t BasicFrameOffset = StackOffset;
1035   if (!IsEbpBasedFrame)
1036     BasicFrameOffset += SpillAreaSizeBytes;
1037 
1038   const VarList &Args = Func->getArgs();
1039   size_t InArgsSizeBytes = 0;
1040   unsigned NumXmmArgs = 0;
1041   unsigned NumGPRArgs = 0;
1042   for (SizeT i = 0, NumArgs = Args.size(); i < NumArgs; ++i) {
1043     Variable *Arg = Args[i];
1044     // Skip arguments passed in registers.
1045     if (isVectorType(Arg->getType())) {
1046       if (RegX8632::getRegisterForXmmArgNum(
1047               RegX8632::getArgIndex(i, NumXmmArgs))
1048               .hasValue()) {
1049         ++NumXmmArgs;
1050         continue;
1051       }
1052     } else if (!isScalarFloatingType(Arg->getType())) {
1053       assert(isScalarIntegerType(Arg->getType()));
1054       if (RegX8632::getRegisterForGprArgNum(
1055               WordType, RegX8632::getArgIndex(i, NumGPRArgs))
1056               .hasValue()) {
1057         ++NumGPRArgs;
1058         continue;
1059       }
1060     }
1061     // For esp-based frames where the allocas are done outside the prolog, the
1062     // esp value may not stabilize to its home value until after all the
1063     // fixed-size alloca instructions have executed.  In this case, a stack
1064     // adjustment is needed when accessing in-args in order to copy them into
1065     // registers.
1066     size_t StackAdjBytes = 0;
1067     if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas)
1068       StackAdjBytes -= FixedAllocaSizeBytes;
1069     finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
1070                            InArgsSizeBytes);
1071   }
1072 
1073   // Fill in stack offsets for locals.
1074   assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
1075                       SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
1076                       IsEbpBasedFrame && !needsStackPointerAlignment());
1077   // Assign stack offsets to variables that have been linked to spilled
1078   // variables.
1079   for (Variable *Var : VariablesLinkedToSpillSlots) {
1080     const Variable *Root = Var->getLinkedToStackRoot();
1081     assert(Root != nullptr);
1082     Var->setStackOffset(Root->getStackOffset());
1083 
1084     // If the stack root variable is an arg, make this variable an arg too so
1085     // that stackVarToAsmAddress uses the correct base pointer (e.g. ebp on
1086     // x86).
1087     Var->setIsArg(Root->getIsArg());
1088   }
1089   this->HasComputedFrame = true;
1090 
1091   if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
1092     OstreamLocker L(Func->getContext());
1093     Ostream &Str = Func->getContext()->getStrDump();
1094 
1095     Str << "Stack layout:\n";
1096     uint32_t EspAdjustmentPaddingSize =
1097         SpillAreaSizeBytes - LocalsSpillAreaSize -
1098         GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
1099         maxOutArgsSizeBytes();
1100     Str << " in-args = " << InArgsSizeBytes << " bytes\n"
1101         << " return address = " << X86_RET_IP_SIZE_BYTES << " bytes\n"
1102         << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
1103         << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
1104         << " globals spill area = " << GlobalsSize << " bytes\n"
1105         << " globals-locals spill areas intermediate padding = "
1106         << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
1107         << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
1108         << " esp alignment padding = " << EspAdjustmentPaddingSize
1109         << " bytes\n";
1110 
1111     Str << "Stack details:\n"
1112         << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
1113         << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
1114         << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n"
1115         << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
1116         << " bytes\n"
1117         << " is ebp based = " << IsEbpBasedFrame << "\n";
1118   }
1119 }
1120 
1121 /// Helper function for addProlog().
1122 ///
1123 /// This assumes Arg is an argument passed on the stack. This sets the frame
1124 /// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
1125 /// I64 arg that has been split into Lo and Hi components, it calls itself
1126 /// recursively on the components, taking care to handle Lo first because of the
1127 /// little-endian architecture. Lastly, this function generates an instruction
1128 /// to copy Arg into its assigned register if applicable.
1129 
1130 void TargetX8632::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
1131                                          size_t BasicFrameOffset,
1132                                          size_t StackAdjBytes,
1133                                          size_t &InArgsSizeBytes) {
1134   if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
1135     Variable *Lo = Arg64On32->getLo();
1136     Variable *Hi = Arg64On32->getHi();
1137     finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, StackAdjBytes,
1138                            InArgsSizeBytes);
1139     finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, StackAdjBytes,
1140                            InArgsSizeBytes);
1141     return;
1142   }
1143   Type Ty = Arg->getType();
1144   if (isVectorType(Ty)) {
1145     InArgsSizeBytes = applyStackAlignment(InArgsSizeBytes);
1146   }
1147   Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
1148   InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
1149   if (Arg->hasReg()) {
1150     assert(Ty != IceType_i64);
1151     auto *Mem = X86OperandMem::create(
1152         Func, Ty, FramePtr,
1153         Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes));
1154     if (isVectorType(Arg->getType())) {
1155       _movp(Arg, Mem);
1156     } else {
1157       _mov(Arg, Mem);
1158     }
1159     // This argument-copying instruction uses an explicit X86OperandMem
1160     // operand instead of a Variable, so its fill-from-stack operation has to
1161     // be tracked separately for statistics.
1162     Ctx->statsUpdateFills();
1163   }
1164 }
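// Hedged illustration for an i64 argument split into Lo/Hi on this
// little-endian target (exact offsets depend on the preceding arguments):
//   [FramePtr + BasicFrameOffset + K]     -> Arg.Lo  (lowered first)
//   [FramePtr + BasicFrameOffset + K + 4] -> Arg.Hi
// Each component that was register-allocated additionally gets a mov from its
// stack home into the assigned register.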
1165 
1166 void TargetX8632::addEpilog(CfgNode *Node) {
1167   InstList &Insts = Node->getInsts();
1168   InstList::reverse_iterator RI, E;
1169   for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
1170     if (llvm::isa<Insts::Ret>(*RI))
1171       break;
1172   }
1173   if (RI == E)
1174     return;
1175 
1176   // Convert the reverse_iterator position into its corresponding (forward)
1177   // iterator position.
1178   InstList::iterator InsertPoint = reverseToForwardIterator(RI);
1179   --InsertPoint;
1180   Context.init(Node);
1181   Context.setInsertPoint(InsertPoint);
1182 
1183   if (IsEbpBasedFrame) {
1184     _unlink_bp();
1185   } else {
1186     // add stackptr, SpillAreaSizeBytes
1187     if (SpillAreaSizeBytes != 0) {
1188       _add_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
1189     }
1190   }
1191 
1192   // Add pop instructions for preserved registers.
1193   SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
1194   SmallBitVector Popped(CalleeSaves.size());
1195   for (int32_t i = CalleeSaves.size() - 1; i >= 0; --i) {
1196     const auto RegNum = RegNumT::fromInt(i);
1197     if (RegNum == getFrameReg() && IsEbpBasedFrame)
1198       continue;
1199     const RegNumT Canonical = RegX8632::getBaseReg(RegNum);
1200     if (CalleeSaves[i] && RegsUsed[i]) {
1201       Popped[Canonical] = true;
1202     }
1203   }
1204   for (int32_t i = Popped.size() - 1; i >= 0; --i) {
1205     if (!Popped[i])
1206       continue;
1207     const auto RegNum = RegNumT::fromInt(i);
1208     assert(RegNum == RegX8632::getBaseReg(RegNum));
1209     _pop_reg(RegNum);
1210   }
1211 }
1212 
1213 Type TargetX8632::stackSlotType() { return WordType; }
1214 
1215 Operand *TargetX8632::loOperand(Operand *Operand) {
1216   assert(Operand->getType() == IceType_i64 ||
1217          Operand->getType() == IceType_f64);
1218   if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
1219     return Operand;
1220   if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
1221     return Var64On32->getLo();
1222   if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
1223     auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
1224         Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue())));
1225     // Check if we need to blind/pool the constant.
1226     return legalize(ConstInt);
1227   }
1228   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
1229     auto *MemOperand = X86OperandMem::create(
1230         Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getIndex(),
1231         Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
1232     // Test whether we should randomize or pool the offset; if so, create the
1233     // mem operand with the blinded/pooled constant. Otherwise, return it as an
1234     // ordinary mem operand.
1235     return legalize(MemOperand);
1236   }
1237   llvm_unreachable("Unsupported operand type");
1238   return nullptr;
1239 }
1240 
1241 Operand *TargetX8632::hiOperand(Operand *Operand) {
1242   assert(Operand->getType() == IceType_i64 ||
1243          Operand->getType() == IceType_f64);
1244   if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
1245     return Operand;
1246   if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
1247     return Var64On32->getHi();
1248   if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
1249     auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
1250         Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue() >> 32)));
1251     // Check if we need to blind/pool the constant.
1252     return legalize(ConstInt);
1253   }
1254   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
1255     Constant *Offset = Mem->getOffset();
1256     if (Offset == nullptr) {
1257       Offset = Ctx->getConstantInt32(4);
1258     } else if (auto *IntOffset = llvm::dyn_cast<ConstantInteger32>(Offset)) {
1259       Offset = Ctx->getConstantInt32(4 + IntOffset->getValue());
1260     } else if (auto *SymOffset = llvm::dyn_cast<ConstantRelocatable>(Offset)) {
1261       assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4));
1262       Offset =
1263           Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName());
1264     }
1265     auto *MemOperand = X86OperandMem::create(
1266         Func, IceType_i32, Mem->getBase(), Offset, Mem->getIndex(),
1267         Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
1268     // Test if the Offset is an eligible i32 constant for randomization and
1269     // pooling. Blind/pool it if it is. Otherwise return as an ordinary mem
1270     // operand.
1271     return legalize(MemOperand);
1272   }
1273   llvm_unreachable("Unsupported operand type");
1274   return nullptr;
1275 }
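// Hedged illustration: for a 64-bit value held in a memory operand at
// [base + off], loOperand() above yields a 32-bit access at [base + off] and
// hiOperand() one at [base + off + 4], matching the little-endian layout.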
1276 
1277 SmallBitVector TargetX8632::getRegisterSet(RegSetMask Include,
1278                                            RegSetMask Exclude) const {
1279   return RegX8632::getRegisterSet(getFlags(), Include, Exclude);
1280 }
1281 
1282 void TargetX8632::lowerAlloca(const InstAlloca *Instr) {
1283   // Conservatively require the stack to be aligned. Some stack adjustment
1284   // operations implemented below assume that the stack is aligned before the
1285   // alloca. All the alloca code ensures that the stack alignment is preserved
1286   // after the alloca. The stack alignment restriction can be relaxed in some
1287   // cases.
1288   RequiredStackAlignment =
1289       std::max<size_t>(RequiredStackAlignment, X86_STACK_ALIGNMENT_BYTES);
1290 
1291   // For default align=0, set it to the real value 1, to avoid any
1292   // bit-manipulation problems below.
1293   const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes());
1294 
1295   // LLVM enforces power of 2 alignment.
1296   assert(llvm::isPowerOf2_32(AlignmentParam));
1297   assert(llvm::isPowerOf2_32(X86_STACK_ALIGNMENT_BYTES));
1298 
1299   const uint32_t Alignment =
1300       std::max(AlignmentParam, X86_STACK_ALIGNMENT_BYTES);
1301   const bool OverAligned = Alignment > X86_STACK_ALIGNMENT_BYTES;
1302   const bool OptM1 = Func->getOptLevel() == Opt_m1;
1303   const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset();
1304   const bool UseFramePointer =
1305       hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;
1306 
1307   if (UseFramePointer)
1308     setHasFramePointer();
1309 
1310   Variable *esp = getPhysicalRegister(getStackReg(), WordType);
1311   if (OverAligned) {
1312     _and(esp, Ctx->getConstantInt32(-Alignment));
1313   }
1314 
1315   Variable *Dest = Instr->getDest();
1316   Operand *TotalSize = legalize(Instr->getSizeInBytes());
1317 
1318   if (const auto *ConstantTotalSize =
1319           llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
1320     const uint32_t Value =
1321         Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
1322     if (UseFramePointer) {
1323       _sub_sp(Ctx->getConstantInt32(Value));
1324     } else {
1325       // If we don't need a Frame Pointer, this alloca has a known offset to the
1326       // stack pointer. We don't need to adjust the stack pointer, nor assign any
1327       // value to Dest, as Dest is rematerializable.
1328       assert(Dest->isRematerializable());
1329       FixedAllocaSizeBytes += Value;
1330       Context.insert<InstFakeDef>(Dest);
1331     }
1332   } else {
1333     // Non-constant sizes need to be adjusted to the next highest multiple of
1334     // the required alignment at runtime.
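    // For example, with Alignment == 16 the emitted sequence is roughly:
    //   mov  t, TotalSize
    //   add  t, 15
    //   and  t, -16
    //   sub  esp, t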
1335     Variable *T = makeReg(IceType_i32);
1336     _mov(T, TotalSize);
1337     _add(T, Ctx->getConstantInt32(Alignment - 1));
1338     _and(T, Ctx->getConstantInt32(-Alignment));
1339     _sub_sp(T);
1340   }
1341   // Add enough to the returned address to account for the out args area.
1342   uint32_t OutArgsSize = maxOutArgsSizeBytes();
1343   if (OutArgsSize > 0) {
1344     Variable *T = makeReg(Dest->getType());
1345     auto *CalculateOperand = X86OperandMem::create(
1346         Func, IceType_void, esp, Ctx->getConstantInt(IceType_i32, OutArgsSize));
1347     _lea(T, CalculateOperand);
1348     _mov(Dest, T);
1349   } else {
1350     _mov(Dest, esp);
1351   }
1352 }
1353 
lowerArguments()1354 void TargetX8632::lowerArguments() {
1355   const bool OptM1 = Func->getOptLevel() == Opt_m1;
1356   VarList &Args = Func->getArgs();
1357   unsigned NumXmmArgs = 0;
1358   bool XmmSlotsRemain = true;
1359   unsigned NumGprArgs = 0;
1360   bool GprSlotsRemain = true;
1361 
1362   Context.init(Func->getEntryNode());
1363   Context.setInsertPoint(Context.getCur());
1364 
1365   for (SizeT i = 0, End = Args.size();
1366        i < End && (XmmSlotsRemain || GprSlotsRemain); ++i) {
1367     Variable *Arg = Args[i];
1368     Type Ty = Arg->getType();
1369     Variable *RegisterArg = nullptr;
1370     RegNumT RegNum;
1371     if (isVectorType(Ty)) {
1372       RegNum = RegX8632::getRegisterForXmmArgNum(
1373           RegX8632::getArgIndex(i, NumXmmArgs));
1374       if (RegNum.hasNoValue()) {
1375         XmmSlotsRemain = false;
1376         continue;
1377       }
1378       ++NumXmmArgs;
1379       RegisterArg = Func->makeVariable(Ty);
1380     } else if (isScalarFloatingType(Ty)) {
1381       continue;
1382     } else if (isScalarIntegerType(Ty)) {
1383       RegNum = RegX8632::getRegisterForGprArgNum(
1384           Ty, RegX8632::getArgIndex(i, NumGprArgs));
1385       if (RegNum.hasNoValue()) {
1386         GprSlotsRemain = false;
1387         continue;
1388       }
1389       ++NumGprArgs;
1390       RegisterArg = Func->makeVariable(Ty);
1391     }
1392     assert(RegNum.hasValue());
1393     assert(RegisterArg != nullptr);
1394     // Replace Arg in the argument list with the home register. Then generate
1395     // an instruction in the prolog to copy the home register to the assigned
1396     // location of Arg.
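    // For example, if a vector argument arrives in xmm0, Args[i] is replaced by
    // a new pre-colored variable "home_reg:arg" in xmm0, and the prolog gets
    // "arg = home_reg:arg" (possibly through a temporary, see below).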
1397     if (BuildDefs::dump())
1398       RegisterArg->setName(Func, "home_reg:" + Arg->getName());
1399     RegisterArg->setRegNum(RegNum);
1400     RegisterArg->setIsArg();
1401     Arg->setIsArg(false);
1402 
1403     Args[i] = RegisterArg;
1404     // When not Om1, do the assignment through a temporary, instead of directly
1405     // from the pre-colored variable, so that a subsequent availabilityGet()
1406     // call has a chance to work.  (In Om1, don't bother creating extra
1407     // instructions with extra variables to register-allocate.)
1408     if (OptM1) {
1409       Context.insert<InstAssign>(Arg, RegisterArg);
1410     } else {
1411       Variable *Tmp = makeReg(RegisterArg->getType());
1412       Context.insert<InstAssign>(Tmp, RegisterArg);
1413       Context.insert<InstAssign>(Arg, Tmp);
1414     }
1415   }
1416   if (!OptM1)
1417     Context.availabilityUpdate();
1418 }
1419 
1420 /// Strength-reduce scalar integer multiplication by a constant (for i32 or
1421 /// narrower) for certain constants. The lea instruction can be used to multiply
1422 /// by 3, 5, or 9, and the shl instruction can be used to multiply by powers of
1423 /// 2. These can be combined such that e.g. multiplying by 100 can be done as 2
1424 /// lea-based multiplies by 5, combined with left-shifting by 2.
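/// For example, x*100 (100 == 5*5*4) can be lowered roughly as:
///   lea t, [t + t*4]   ; t *= 5
///   lea t, [t + t*4]   ; t *= 5
///   shl t, 2           ; t *= 4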
1425 
optimizeScalarMul(Variable * Dest,Operand * Src0,int32_t Src1)1426 bool TargetX8632::optimizeScalarMul(Variable *Dest, Operand *Src0,
1427                                     int32_t Src1) {
1428   // Disable this optimization for Om1 and O0, just to keep things simple
1429   // there.
1430   if (Func->getOptLevel() < Opt_1)
1431     return false;
1432   Type Ty = Dest->getType();
1433   if (Src1 == -1) {
1434     Variable *T = nullptr;
1435     _mov(T, Src0);
1436     _neg(T);
1437     _mov(Dest, T);
1438     return true;
1439   }
1440   if (Src1 == 0) {
1441     _mov(Dest, Ctx->getConstantZero(Ty));
1442     return true;
1443   }
1444   if (Src1 == 1) {
1445     Variable *T = nullptr;
1446     _mov(T, Src0);
1447     _mov(Dest, T);
1448     return true;
1449   }
1450   // Don't bother with the edge case where Src1 == MININT.
1451   if (Src1 == -Src1)
1452     return false;
1453   const bool Src1IsNegative = Src1 < 0;
1454   if (Src1IsNegative)
1455     Src1 = -Src1;
1456   uint32_t Count9 = 0;
1457   uint32_t Count5 = 0;
1458   uint32_t Count3 = 0;
1459   uint32_t Count2 = 0;
1460   uint32_t CountOps = 0;
1461   while (Src1 > 1) {
1462     if (Src1 % 9 == 0) {
1463       ++CountOps;
1464       ++Count9;
1465       Src1 /= 9;
1466     } else if (Src1 % 5 == 0) {
1467       ++CountOps;
1468       ++Count5;
1469       Src1 /= 5;
1470     } else if (Src1 % 3 == 0) {
1471       ++CountOps;
1472       ++Count3;
1473       Src1 /= 3;
1474     } else if (Src1 % 2 == 0) {
1475       if (Count2 == 0)
1476         ++CountOps;
1477       ++Count2;
1478       Src1 /= 2;
1479     } else {
1480       return false;
1481     }
1482   }
1483   // The lea-based multiplies below are only emitted for i32, not i8 or i16.
1484   if (Ty != IceType_i32 && (Count3 || Count5 || Count9))
1485     return false;
1486   // Limit the number of lea/shl operations for a single multiply, to a
1487   // somewhat arbitrary choice of 3.
1488   constexpr uint32_t MaxOpsForOptimizedMul = 3;
1489   if (CountOps > MaxOpsForOptimizedMul)
1490     return false;
1491   Variable *T = makeReg(WordType);
1492   if (typeWidthInBytes(Src0->getType()) < typeWidthInBytes(T->getType())) {
1493     Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
1494     _movzx(T, Src0RM);
1495   } else {
1496     _mov(T, Src0);
1497   }
1498   Constant *Zero = Ctx->getConstantZero(IceType_i32);
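  // Each lea below computes T = T + T*(1 << Shift), i.e. it multiplies T by 9,
  // 5, or 3 in a single instruction.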
1499   for (uint32_t i = 0; i < Count9; ++i) {
1500     constexpr uint16_t Shift = 3; // log2(9-1)
1501     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1502   }
1503   for (uint32_t i = 0; i < Count5; ++i) {
1504     constexpr uint16_t Shift = 2; // log2(5-1)
1505     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1506   }
1507   for (uint32_t i = 0; i < Count3; ++i) {
1508     constexpr uint16_t Shift = 1; // log2(3-1)
1509     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1510   }
1511   if (Count2) {
1512     _shl(T, Ctx->getConstantInt(Ty, Count2));
1513   }
1514   if (Src1IsNegative)
1515     _neg(T);
1516   _mov(Dest, T);
1517   return true;
1518 }
1519 
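/// Lowers a 64-bit shift whose operands have already been split into lo/hi
/// halves. Constant shift amounts are expanded with shl/shr/sar plus shld/shrd;
/// for non-constant amounts, the hardware masks the shift count to 5 bits, so
/// the sequence below also tests bit 5 of cl (test cl, 0x20) and patches up the
/// two halves when the count is 32 or more.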
lowerShift64(InstArithmetic::OpKind Op,Operand * Src0Lo,Operand * Src0Hi,Operand * Src1Lo,Variable * DestLo,Variable * DestHi)1520 void TargetX8632::lowerShift64(InstArithmetic::OpKind Op, Operand *Src0Lo,
1521                                Operand *Src0Hi, Operand *Src1Lo,
1522                                Variable *DestLo, Variable *DestHi) {
1523   // TODO: Refactor the similarities between Shl, Lshr, and Ashr.
1524   Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
1525   Constant *Zero = Ctx->getConstantZero(IceType_i32);
1526   Constant *SignExtend = Ctx->getConstantInt32(0x1f);
1527   if (auto *ConstantShiftAmount = llvm::dyn_cast<ConstantInteger32>(Src1Lo)) {
1528     uint32_t ShiftAmount = ConstantShiftAmount->getValue();
1529     if (ShiftAmount > 32) {
1530       Constant *ReducedShift = Ctx->getConstantInt32(ShiftAmount - 32);
1531       switch (Op) {
1532       default:
1533         assert(0 && "non-shift op");
1534         break;
1535       case InstArithmetic::Shl: {
1536         // a=b<<c ==>
1537         //   t2 = b.lo
1538         //   t2 = shl t2, ShiftAmount-32
1539         //   t3 = t2
1540         //   t2 = 0
1541         _mov(T_2, Src0Lo);
1542         _shl(T_2, ReducedShift);
1543         _mov(DestHi, T_2);
1544         _mov(DestLo, Zero);
1545       } break;
1546       case InstArithmetic::Lshr: {
1547         // a=b>>c (unsigned) ==>
1548         //   t2 = b.hi
1549         //   t2 = shr t2, ShiftAmount-32
1550         //   a.lo = t2
1551         //   a.hi = 0
1552         _mov(T_2, Src0Hi);
1553         _shr(T_2, ReducedShift);
1554         _mov(DestLo, T_2);
1555         _mov(DestHi, Zero);
1556       } break;
1557       case InstArithmetic::Ashr: {
1558         // a=b>>c (signed) ==>
1559         //   t3 = b.hi
1560         //   t3 = sar t3, 0x1f
1561         //   t2 = b.hi
1562         //   t2 = shrd t2, t3, ShiftAmount-32
1563         //   a.lo = t2
1564         //   a.hi = t3
1565         _mov(T_3, Src0Hi);
1566         _sar(T_3, SignExtend);
1567         _mov(T_2, Src0Hi);
1568         _shrd(T_2, T_3, ReducedShift);
1569         _mov(DestLo, T_2);
1570         _mov(DestHi, T_3);
1571       } break;
1572       }
1573     } else if (ShiftAmount == 32) {
1574       switch (Op) {
1575       default:
1576         assert(0 && "non-shift op");
1577         break;
1578       case InstArithmetic::Shl: {
1579         // a=b<<c ==>
1580         //   t2 = b.lo
1581         //   a.hi = t2
1582         //   a.lo = 0
1583         _mov(T_2, Src0Lo);
1584         _mov(DestHi, T_2);
1585         _mov(DestLo, Zero);
1586       } break;
1587       case InstArithmetic::Lshr: {
1588         // a=b>>c (unsigned) ==>
1589         //   t2 = b.hi
1590         //   a.lo = t2
1591         //   a.hi = 0
1592         _mov(T_2, Src0Hi);
1593         _mov(DestLo, T_2);
1594         _mov(DestHi, Zero);
1595       } break;
1596       case InstArithmetic::Ashr: {
1597         // a=b>>c (signed) ==>
1598         //   t2 = b.hi
1599         //   a.lo = t2
1600         //   t3 = b.hi
1601         //   t3 = sar t3, 0x1f
1602         //   a.hi = t3
1603         _mov(T_2, Src0Hi);
1604         _mov(DestLo, T_2);
1605         _mov(T_3, Src0Hi);
1606         _sar(T_3, SignExtend);
1607         _mov(DestHi, T_3);
1608       } break;
1609       }
1610     } else {
1611       // COMMON PREFIX OF: a=b SHIFT_OP c ==>
1612       //   t2 = b.lo
1613       //   t3 = b.hi
1614       _mov(T_2, Src0Lo);
1615       _mov(T_3, Src0Hi);
1616       switch (Op) {
1617       default:
1618         assert(0 && "non-shift op");
1619         break;
1620       case InstArithmetic::Shl: {
1621         // a=b<<c ==>
1622         //   t3 = shld t3, t2, ShiftAmount
1623         //   t2 = shl t2, ShiftAmount
1624         _shld(T_3, T_2, ConstantShiftAmount);
1625         _shl(T_2, ConstantShiftAmount);
1626       } break;
1627       case InstArithmetic::Lshr: {
1628         // a=b>>c (unsigned) ==>
1629         //   t2 = shrd t2, t3, ShiftAmount
1630         //   t3 = shr t3, ShiftAmount
1631         _shrd(T_2, T_3, ConstantShiftAmount);
1632         _shr(T_3, ConstantShiftAmount);
1633       } break;
1634       case InstArithmetic::Ashr: {
1635         // a=b>>c (signed) ==>
1636         //   t2 = shrd t2, t3, ShiftAmount
1637         //   t3 = sar t3, ShiftAmount
1638         _shrd(T_2, T_3, ConstantShiftAmount);
1639         _sar(T_3, ConstantShiftAmount);
1640       } break;
1641       }
1642       // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
1643       //   a.lo = t2
1644       //   a.hi = t3
1645       _mov(DestLo, T_2);
1646       _mov(DestHi, T_3);
1647     }
1648   } else {
1649     // NON-CONSTANT CASES.
1650     Constant *BitTest = Ctx->getConstantInt32(0x20);
1651     InstX86Label *Label = InstX86Label::create(Func, this);
1652     // COMMON PREFIX OF: a=b SHIFT_OP c ==>
1653     //   t1:ecx = c.lo & 0xff
1654     //   t2 = b.lo
1655     //   t3 = b.hi
1656     T_1 = copyToReg8(Src1Lo, RegX8632::Reg_cl);
1657     _mov(T_2, Src0Lo);
1658     _mov(T_3, Src0Hi);
1659     switch (Op) {
1660     default:
1661       assert(0 && "non-shift op");
1662       break;
1663     case InstArithmetic::Shl: {
1664       // a=b<<c ==>
1665       //   t3 = shld t3, t2, t1
1666       //   t2 = shl t2, t1
1667       //   test t1, 0x20
1668       //   je L1
1669       //   use(t3)
1670       //   t3 = t2
1671       //   t2 = 0
1672       _shld(T_3, T_2, T_1);
1673       _shl(T_2, T_1);
1674       _test(T_1, BitTest);
1675       _br(CondX86::Br_e, Label);
1676       // T_2 and T_3 are being assigned again because of the intra-block control
1677       // flow, so we need to use _redefined to avoid liveness problems.
1678       _redefined(_mov(T_3, T_2));
1679       _redefined(_mov(T_2, Zero));
1680     } break;
1681     case InstArithmetic::Lshr: {
1682       // a=b>>c (unsigned) ==>
1683       //   t2 = shrd t2, t3, t1
1684       //   t3 = shr t3, t1
1685       //   test t1, 0x20
1686       //   je L1
1687       //   use(t2)
1688       //   t2 = t3
1689       //   t3 = 0
1690       _shrd(T_2, T_3, T_1);
1691       _shr(T_3, T_1);
1692       _test(T_1, BitTest);
1693       _br(CondX86::Br_e, Label);
1694       // T_2 and T_3 are being assigned again because of the intra-block control
1695       // flow, so we need to use _redefined to avoid liveness problems.
1696       _redefined(_mov(T_2, T_3));
1697       _redefined(_mov(T_3, Zero));
1698     } break;
1699     case InstArithmetic::Ashr: {
1700       // a=b>>c (signed) ==>
1701       //   t2 = shrd t2, t3, t1
1702       //   t3 = sar t3, t1
1703       //   test t1, 0x20
1704       //   je L1
1705       //   use(t2)
1706       //   t2 = t3
1707       //   t3 = sar t3, 0x1f
1708       Constant *SignExtend = Ctx->getConstantInt32(0x1f);
1709       _shrd(T_2, T_3, T_1);
1710       _sar(T_3, T_1);
1711       _test(T_1, BitTest);
1712       _br(CondX86::Br_e, Label);
1713       // T_2 and T_3 are being assigned again because of the intra-block control
1714       // flow, so T_2 needs to use _redefined to avoid liveness problems. T_3
1715       // doesn't need special treatment because it is reassigned via _sar
1716       // instead of _mov.
1717       _redefined(_mov(T_2, T_3));
1718       _sar(T_3, SignExtend);
1719     } break;
1720     }
1721     // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
1722     // L1:
1723     //   a.lo = t2
1724     //   a.hi = t3
1725     Context.insert(Label);
1726     _mov(DestLo, T_2);
1727     _mov(DestHi, T_3);
1728   }
1729 }
1730 
lowerArithmetic(const InstArithmetic * Instr)1731 void TargetX8632::lowerArithmetic(const InstArithmetic *Instr) {
1732   Variable *Dest = Instr->getDest();
1733   if (Dest->isRematerializable()) {
1734     Context.insert<InstFakeDef>(Dest);
1735     return;
1736   }
1737   Type Ty = Dest->getType();
1738   Operand *Src0 = legalize(Instr->getSrc(0));
1739   Operand *Src1 = legalize(Instr->getSrc(1));
1740   if (Instr->isCommutative()) {
1741     uint32_t SwapCount = 0;
1742     if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) {
1743       std::swap(Src0, Src1);
1744       ++SwapCount;
1745     }
1746     if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) {
1747       std::swap(Src0, Src1);
1748       ++SwapCount;
1749     }
1750     // Improve two-address code patterns by avoiding a copy to the dest
1751     // register when one of the source operands ends its lifetime here.
1752     if (!Instr->isLastUse(Src0) && Instr->isLastUse(Src1)) {
1753       std::swap(Src0, Src1);
1754       ++SwapCount;
1755     }
1756     assert(SwapCount <= 1);
1757     (void)SwapCount;
1758   }
1759   if (Ty == IceType_i64) {
1760     // These x86-32 helper-call-involved instructions are lowered in this
1761     // separate switch. This is because loOperand() and hiOperand() may insert
1762     // redundant instructions for constant blinding and pooling. Such redundant
1763     // instructions will fail liveness analysis under -Om1 setting. And,
1764     // actually these arguments do not need to be processed with loOperand()
1765     // and hiOperand() to be used.
1766     switch (Instr->getOp()) {
1767     case InstArithmetic::Udiv:
1768     case InstArithmetic::Sdiv:
1769     case InstArithmetic::Urem:
1770     case InstArithmetic::Srem:
1771       llvm::report_fatal_error("Helper call was expected");
1772       return;
1773     default:
1774       break;
1775     }
1776 
1777     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
1778     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
1779     Operand *Src0Lo = loOperand(Src0);
1780     Operand *Src0Hi = hiOperand(Src0);
1781     Operand *Src1Lo = loOperand(Src1);
1782     Operand *Src1Hi = hiOperand(Src1);
1783     Variable *T_Lo = nullptr, *T_Hi = nullptr;
1784     switch (Instr->getOp()) {
1785     case InstArithmetic::_num:
1786       llvm_unreachable("Unknown arithmetic operator");
1787       break;
1788     case InstArithmetic::Add:
1789       _mov(T_Lo, Src0Lo);
1790       _add(T_Lo, Src1Lo);
1791       _mov(DestLo, T_Lo);
1792       _mov(T_Hi, Src0Hi);
1793       _adc(T_Hi, Src1Hi);
1794       _mov(DestHi, T_Hi);
1795       break;
1796     case InstArithmetic::And:
1797       _mov(T_Lo, Src0Lo);
1798       _and(T_Lo, Src1Lo);
1799       _mov(DestLo, T_Lo);
1800       _mov(T_Hi, Src0Hi);
1801       _and(T_Hi, Src1Hi);
1802       _mov(DestHi, T_Hi);
1803       break;
1804     case InstArithmetic::Or:
1805       _mov(T_Lo, Src0Lo);
1806       _or(T_Lo, Src1Lo);
1807       _mov(DestLo, T_Lo);
1808       _mov(T_Hi, Src0Hi);
1809       _or(T_Hi, Src1Hi);
1810       _mov(DestHi, T_Hi);
1811       break;
1812     case InstArithmetic::Xor:
1813       _mov(T_Lo, Src0Lo);
1814       _xor(T_Lo, Src1Lo);
1815       _mov(DestLo, T_Lo);
1816       _mov(T_Hi, Src0Hi);
1817       _xor(T_Hi, Src1Hi);
1818       _mov(DestHi, T_Hi);
1819       break;
1820     case InstArithmetic::Sub:
1821       _mov(T_Lo, Src0Lo);
1822       _sub(T_Lo, Src1Lo);
1823       _mov(DestLo, T_Lo);
1824       _mov(T_Hi, Src0Hi);
1825       _sbb(T_Hi, Src1Hi);
1826       _mov(DestHi, T_Hi);
1827       break;
1828     case InstArithmetic::Mul: {
1829       Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
1830       Variable *T_4Lo = makeReg(IceType_i32, RegX8632::Reg_eax);
1831       Variable *T_4Hi = makeReg(IceType_i32, RegX8632::Reg_edx);
1832       // gcc does the following:
1833       // a=b*c ==>
1834       //   t1 = b.hi; t1 *=(imul) c.lo
1835       //   t2 = c.hi; t2 *=(imul) b.lo
1836       //   t3:eax = b.lo
1837       //   t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo
1838       //   a.lo = t4.lo
1839       //   t4.hi += t1
1840       //   t4.hi += t2
1841       //   a.hi = t4.hi
1842       // The mul instruction cannot take an immediate operand.
1843       Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
1844       _mov(T_1, Src0Hi);
1845       _imul(T_1, Src1Lo);
1846       _mov(T_3, Src0Lo, RegX8632::Reg_eax);
1847       _mul(T_4Lo, T_3, Src1Lo);
1848       // The mul instruction produces two dest variables, edx:eax. We create a
1849       // fake definition of edx to account for this.
1850       Context.insert<InstFakeDef>(T_4Hi, T_4Lo);
1851       Context.insert<InstFakeUse>(T_4Hi);
1852       _mov(DestLo, T_4Lo);
1853       _add(T_4Hi, T_1);
1854       _mov(T_2, Src1Hi);
1855       Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
1856       _imul(T_2, Src0Lo);
1857       _add(T_4Hi, T_2);
1858       _mov(DestHi, T_4Hi);
1859     } break;
1860     case InstArithmetic::Shl:
1861     case InstArithmetic::Lshr:
1862     case InstArithmetic::Ashr:
1863       lowerShift64(Instr->getOp(), Src0Lo, Src0Hi, Src1Lo, DestLo, DestHi);
1864       break;
1865     case InstArithmetic::Fadd:
1866     case InstArithmetic::Fsub:
1867     case InstArithmetic::Fmul:
1868     case InstArithmetic::Fdiv:
1869     case InstArithmetic::Frem:
1870       llvm_unreachable("FP instruction with i64 type");
1871       break;
1872     case InstArithmetic::Udiv:
1873     case InstArithmetic::Sdiv:
1874     case InstArithmetic::Urem:
1875     case InstArithmetic::Srem:
1876       llvm_unreachable("Call-helper-involved instruction for i64 type "
1877                        "should have already been handled before");
1878       break;
1879     }
1880     return;
1881   }
1882   if (isVectorType(Ty)) {
1883     // TODO: Trap on integer divide and integer modulo by zero. See:
1884     // https://code.google.com/p/nativeclient/issues/detail?id=3899
1885     if (llvm::isa<X86OperandMem>(Src1))
1886       Src1 = legalizeToReg(Src1);
1887     switch (Instr->getOp()) {
1888     case InstArithmetic::_num:
1889       llvm_unreachable("Unknown arithmetic operator");
1890       break;
1891     case InstArithmetic::Add: {
1892       Variable *T = makeReg(Ty);
1893       _movp(T, Src0);
1894       _padd(T, Src1);
1895       _movp(Dest, T);
1896     } break;
1897     case InstArithmetic::And: {
1898       Variable *T = makeReg(Ty);
1899       _movp(T, Src0);
1900       _pand(T, Src1);
1901       _movp(Dest, T);
1902     } break;
1903     case InstArithmetic::Or: {
1904       Variable *T = makeReg(Ty);
1905       _movp(T, Src0);
1906       _por(T, Src1);
1907       _movp(Dest, T);
1908     } break;
1909     case InstArithmetic::Xor: {
1910       Variable *T = makeReg(Ty);
1911       _movp(T, Src0);
1912       _pxor(T, Src1);
1913       _movp(Dest, T);
1914     } break;
1915     case InstArithmetic::Sub: {
1916       Variable *T = makeReg(Ty);
1917       _movp(T, Src0);
1918       _psub(T, Src1);
1919       _movp(Dest, T);
1920     } break;
1921     case InstArithmetic::Mul: {
1922       bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16;
1923       bool InstructionSetIsValidForPmull =
1924           Ty == IceType_v8i16 || InstructionSet >= SSE4_1;
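      // pmullw handles v8i16 on any SSE2 target, but pmulld (needed for v4i32)
      // is only available from SSE4.1 onward.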
1925       if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
1926         Variable *T = makeReg(Ty);
1927         _movp(T, Src0);
1928         _pmull(T, Src0 == Src1 ? T : Src1);
1929         _movp(Dest, T);
1930       } else if (Ty == IceType_v4i32) {
1931         // Lowering sequence:
1932         // Note: The mask arguments have index 0 on the left.
1933         //
1934         // movups  T1, Src0
1935         // pshufd  T2, Src0, {1,0,3,0}
1936         // pshufd  T3, Src1, {1,0,3,0}
1937         // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
1938         // pmuludq T1, Src1
1939         // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
1940         // pmuludq T2, T3
1941         // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
1942         // shufps  T1, T2, {0,2,0,2}
1943         // pshufd  T4, T1, {0,2,1,3}
1944         // movups  Dest, T4
1945 
1946         // Mask that directs pshufd to create a vector with entries
1947         // Src[1, 0, 3, 0]
1948         constexpr unsigned Constant1030 = 0x31;
1949         Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
1950         // Mask that directs shufps to create a vector with entries
1951         // Dest[0, 2], Src[0, 2]
1952         constexpr unsigned Mask0202 = 0x88;
1953         // Mask that directs pshufd to create a vector with entries
1954         // Src[0, 2, 1, 3]
1955         constexpr unsigned Mask0213 = 0xd8;
1956         Variable *T1 = makeReg(IceType_v4i32);
1957         Variable *T2 = makeReg(IceType_v4i32);
1958         Variable *T3 = makeReg(IceType_v4i32);
1959         Variable *T4 = makeReg(IceType_v4i32);
1960         _movp(T1, Src0);
1961         _pshufd(T2, Src0, Mask1030);
1962         _pshufd(T3, Src1, Mask1030);
1963         _pmuludq(T1, Src1);
1964         _pmuludq(T2, T3);
1965         _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
1966         _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
1967         _movp(Dest, T4);
1968       } else if (Ty == IceType_v16i8) {
1969         llvm::report_fatal_error("Scalarized operation was expected");
1970       } else {
1971         llvm::report_fatal_error("Invalid vector multiply type");
1972       }
1973     } break;
1974     case InstArithmetic::Shl: {
1975       assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
1976       Variable *T = makeReg(Ty);
1977       _movp(T, Src0);
1978       _psll(T, Src1);
1979       _movp(Dest, T);
1980     } break;
1981     case InstArithmetic::Lshr: {
1982       assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
1983       Variable *T = makeReg(Ty);
1984       _movp(T, Src0);
1985       _psrl(T, Src1);
1986       _movp(Dest, T);
1987     } break;
1988     case InstArithmetic::Ashr: {
1989       assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
1990       Variable *T = makeReg(Ty);
1991       _movp(T, Src0);
1992       _psra(T, Src1);
1993       _movp(Dest, T);
1994     } break;
1995     case InstArithmetic::Udiv:
1996     case InstArithmetic::Urem:
1997     case InstArithmetic::Sdiv:
1998     case InstArithmetic::Srem:
1999       llvm::report_fatal_error("Scalarized operation was expected");
2000       break;
2001     case InstArithmetic::Fadd: {
2002       Variable *T = makeReg(Ty);
2003       _movp(T, Src0);
2004       _addps(T, Src1);
2005       _movp(Dest, T);
2006     } break;
2007     case InstArithmetic::Fsub: {
2008       Variable *T = makeReg(Ty);
2009       _movp(T, Src0);
2010       _subps(T, Src1);
2011       _movp(Dest, T);
2012     } break;
2013     case InstArithmetic::Fmul: {
2014       Variable *T = makeReg(Ty);
2015       _movp(T, Src0);
2016       _mulps(T, Src0 == Src1 ? T : Src1);
2017       _movp(Dest, T);
2018     } break;
2019     case InstArithmetic::Fdiv: {
2020       Variable *T = makeReg(Ty);
2021       _movp(T, Src0);
2022       _divps(T, Src1);
2023       _movp(Dest, T);
2024     } break;
2025     case InstArithmetic::Frem:
2026       llvm::report_fatal_error("Scalarized operation was expected");
2027       break;
2028     }
2029     return;
2030   }
2031   Variable *T_edx = nullptr;
2032   Variable *T = nullptr;
2033   switch (Instr->getOp()) {
2034   case InstArithmetic::_num:
2035     llvm_unreachable("Unknown arithmetic operator");
2036     break;
2037   case InstArithmetic::Add: {
2038     const bool ValidType = Ty == IceType_i32;
2039     auto *Const = llvm::dyn_cast<Constant>(Instr->getSrc(1));
2040     const bool ValidKind =
2041         Const != nullptr && (llvm::isa<ConstantInteger32>(Const) ||
2042                              llvm::isa<ConstantRelocatable>(Const));
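    // With aggressive lea enabled, "a = b + C" for a 32-bit constant (or
    // relocatable) C is folded into an address computation, roughly:
    //   lea t, [b + C]
    //   mov a, t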
2043     if (getFlags().getAggressiveLea() && ValidType && ValidKind) {
2044       auto *Var = legalizeToReg(Src0);
2045       auto *Mem = X86OperandMem::create(Func, IceType_void, Var, Const);
2046       T = makeReg(Ty);
2047       _lea(T, Mem);
2048       _mov(Dest, T);
2049       break;
2050     }
2051     _mov(T, Src0);
2052     _add(T, Src1);
2053     _mov(Dest, T);
2054   } break;
2055   case InstArithmetic::And:
2056     _mov(T, Src0);
2057     _and(T, Src1);
2058     _mov(Dest, T);
2059     break;
2060   case InstArithmetic::Or:
2061     _mov(T, Src0);
2062     _or(T, Src1);
2063     _mov(Dest, T);
2064     break;
2065   case InstArithmetic::Xor:
2066     _mov(T, Src0);
2067     _xor(T, Src1);
2068     _mov(Dest, T);
2069     break;
2070   case InstArithmetic::Sub:
2071     _mov(T, Src0);
2072     _sub(T, Src1);
2073     _mov(Dest, T);
2074     break;
2075   case InstArithmetic::Mul:
2076     if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2077       if (optimizeScalarMul(Dest, Src0, C->getValue()))
2078         return;
2079     }
2080     // The 8-bit version of imul only allows the form "imul r/m8" where T must
2081     // be in al.
2082     if (isByteSizedArithType(Ty)) {
2083       _mov(T, Src0, RegX8632::Reg_al);
2084       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2085       _imul(T, Src0 == Src1 ? T : Src1);
2086       _mov(Dest, T);
2087     } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2088       T = makeReg(Ty);
2089       Src0 = legalize(Src0, Legal_Reg | Legal_Mem);
2090       _imul_imm(T, Src0, ImmConst);
2091       _mov(Dest, T);
2092     } else {
2093       _mov(T, Src0);
2094       // No need to legalize Src1 to Reg | Mem because the Imm case is handled
2095       // already by the ConstantInteger32 case above.
2096       _imul(T, Src0 == Src1 ? T : Src1);
2097       _mov(Dest, T);
2098     }
2099     break;
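  // For the scalar shifts below, x86 only accepts an immediate or the cl
  // register as the shift count, hence the copyToReg8(..., Reg_cl) calls.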
2100   case InstArithmetic::Shl:
2101     _mov(T, Src0);
2102     if (!llvm::isa<ConstantInteger32>(Src1) &&
2103         !llvm::isa<ConstantInteger64>(Src1))
2104       Src1 = copyToReg8(Src1, RegX8632::Reg_cl);
2105     _shl(T, Src1);
2106     _mov(Dest, T);
2107     break;
2108   case InstArithmetic::Lshr:
2109     _mov(T, Src0);
2110     if (!llvm::isa<ConstantInteger32>(Src1) &&
2111         !llvm::isa<ConstantInteger64>(Src1))
2112       Src1 = copyToReg8(Src1, RegX8632::Reg_cl);
2113     _shr(T, Src1);
2114     _mov(Dest, T);
2115     break;
2116   case InstArithmetic::Ashr:
2117     _mov(T, Src0);
2118     if (!llvm::isa<ConstantInteger32>(Src1) &&
2119         !llvm::isa<ConstantInteger64>(Src1))
2120       Src1 = copyToReg8(Src1, RegX8632::Reg_cl);
2121     _sar(T, Src1);
2122     _mov(Dest, T);
2123     break;
2124   case InstArithmetic::Udiv: {
2125     // div and idiv are the few arithmetic operators that do not allow
2126     // immediates as the operand.
2127     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2128     RegNumT Eax;
2129     RegNumT Edx;
2130     switch (Ty) {
2131     default:
2132       llvm::report_fatal_error("Bad type for udiv");
2133     case IceType_i32:
2134       Eax = RegX8632::Reg_eax;
2135       Edx = RegX8632::Reg_edx;
2136       break;
2137     case IceType_i16:
2138       Eax = RegX8632::Reg_ax;
2139       Edx = RegX8632::Reg_dx;
2140       break;
2141     case IceType_i8:
2142       Eax = RegX8632::Reg_al;
2143       Edx = RegX8632::Reg_ah;
2144       break;
2145     }
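    // The emitted sequence is roughly (shown for i32; ax/dx or al/ah are used
    // for the narrower types):
    //   mov eax, Src0
    //   mov edx, 0
    //   div Src1          ; quotient in eax, remainder in edx
    //   mov Dest, eax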
2146     T_edx = makeReg(Ty, Edx);
2147     _mov(T, Src0, Eax);
2148     _mov(T_edx, Ctx->getConstantZero(Ty));
2149     _div(T_edx, Src1, T);
2150     _redefined(Context.insert<InstFakeDef>(T, T_edx));
2151     _mov(Dest, T);
2152   } break;
2153   case InstArithmetic::Sdiv:
2154     // TODO(stichnot): Enable this after doing better performance and cross
2155     // testing.
2156     if (false && Func->getOptLevel() >= Opt_1) {
2157       // Optimize division by constant power of 2, but not for Om1 or O0, just
2158       // to keep things simple there.
2159       if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2160         const int32_t Divisor = C->getValue();
2161         const uint32_t UDivisor = Divisor;
2162         if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
2163           uint32_t LogDiv = llvm::Log2_32(UDivisor);
2164           // LLVM does the following for dest=src/(1<<log):
2165           //   t=src
2166           //   sar t,typewidth-1 // -1 if src is negative, 0 if not
2167           //   shr t,typewidth-log
2168           //   add t,src
2169           //   sar t,log
2170           //   dest=t
2171           uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
2172           _mov(T, Src0);
2173           // If for some reason we are dividing by 1, just treat it like an
2174           // assignment.
2175           if (LogDiv > 0) {
2176             // The initial sar is unnecessary when dividing by 2.
2177             if (LogDiv > 1)
2178               _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
2179             _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
2180             _add(T, Src0);
2181             _sar(T, Ctx->getConstantInt(Ty, LogDiv));
2182           }
2183           _mov(Dest, T);
2184           return;
2185         }
2186       }
2187     }
2188     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2189     switch (Ty) {
2190     default:
2191       llvm::report_fatal_error("Bad type for sdiv");
2192     case IceType_i32:
2193       T_edx = makeReg(Ty, RegX8632::Reg_edx);
2194       _mov(T, Src0, RegX8632::Reg_eax);
2195       break;
2196     case IceType_i16:
2197       T_edx = makeReg(Ty, RegX8632::Reg_dx);
2198       _mov(T, Src0, RegX8632::Reg_ax);
2199       break;
2200     case IceType_i8:
2201       T_edx = makeReg(IceType_i16, RegX8632::Reg_ax);
2202       _mov(T, Src0, RegX8632::Reg_al);
2203       break;
2204     }
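    // _cbwdq sign-extends the dividend as idiv requires: cdq (eax -> edx:eax),
    // cwd (ax -> dx:ax), or cbw (al -> ax), depending on the operand type.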
2205     _cbwdq(T_edx, T);
2206     _idiv(T_edx, Src1, T);
2207     _redefined(Context.insert<InstFakeDef>(T, T_edx));
2208     _mov(Dest, T);
2209     break;
2210   case InstArithmetic::Urem: {
2211     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2212     RegNumT Eax;
2213     RegNumT Edx;
2214     switch (Ty) {
2215     default:
2216       llvm::report_fatal_error("Bad type for urem");
2217     case IceType_i32:
2218       Eax = RegX8632::Reg_eax;
2219       Edx = RegX8632::Reg_edx;
2220       break;
2221     case IceType_i16:
2222       Eax = RegX8632::Reg_ax;
2223       Edx = RegX8632::Reg_dx;
2224       break;
2225     case IceType_i8:
2226       Eax = RegX8632::Reg_al;
2227       Edx = RegX8632::Reg_ah;
2228       break;
2229     }
2230     T_edx = makeReg(Ty, Edx);
2231     _mov(T_edx, Ctx->getConstantZero(Ty));
2232     _mov(T, Src0, Eax);
2233     _div(T, Src1, T_edx);
2234     _redefined(Context.insert<InstFakeDef>(T_edx, T));
2235     if (Ty == IceType_i8) {
2236       // Register ah must be moved into one of {al,bl,cl,dl} before it can be
2237       // moved into a general 8-bit register.
2238       auto *T_AhRcvr = makeReg(Ty);
2239       T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
2240       _mov(T_AhRcvr, T_edx);
2241       T_edx = T_AhRcvr;
2242     }
2243     _mov(Dest, T_edx);
2244   } break;
2245   case InstArithmetic::Srem: {
2246     // TODO(stichnot): Enable this after doing better performance and cross
2247     // testing.
2248     if (false && Func->getOptLevel() >= Opt_1) {
2249       // Optimize mod by constant power of 2, but not for Om1 or O0, just to
2250       // keep things simple there.
2251       if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2252         const int32_t Divisor = C->getValue();
2253         const uint32_t UDivisor = Divisor;
2254         if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
2255           uint32_t LogDiv = llvm::Log2_32(UDivisor);
2256           // LLVM does the following for dest=src%(1<<log):
2257           //   t=src
2258           //   sar t,typewidth-1 // -1 if src is negative, 0 if not
2259           //   shr t,typewidth-log
2260           //   add t,src
2261           //   and t, -(1<<log)
2262           //   sub t,src
2263           //   neg t
2264           //   dest=t
2265           uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
2266           // If for some reason we are dividing by 1, just assign 0.
2267           if (LogDiv == 0) {
2268             _mov(Dest, Ctx->getConstantZero(Ty));
2269             return;
2270           }
2271           _mov(T, Src0);
2272           // The initial sar is unnecessary when dividing by 2.
2273           if (LogDiv > 1)
2274             _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
2275           _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
2276           _add(T, Src0);
2277           _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
2278           _sub(T, Src0);
2279           _neg(T);
2280           _mov(Dest, T);
2281           return;
2282         }
2283       }
2284     }
2285     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2286     RegNumT Eax;
2287     RegNumT Edx;
2288     switch (Ty) {
2289     default:
2290       llvm::report_fatal_error("Bad type for srem");
2291     case IceType_i32:
2292       Eax = RegX8632::Reg_eax;
2293       Edx = RegX8632::Reg_edx;
2294       break;
2295     case IceType_i16:
2296       Eax = RegX8632::Reg_ax;
2297       Edx = RegX8632::Reg_dx;
2298       break;
2299     case IceType_i8:
2300       Eax = RegX8632::Reg_al;
2301       Edx = RegX8632::Reg_ah;
2302       break;
2303     }
2304     T_edx = makeReg(Ty, Edx);
2305     _mov(T, Src0, Eax);
2306     _cbwdq(T_edx, T);
2307     _idiv(T, Src1, T_edx);
2308     _redefined(Context.insert<InstFakeDef>(T_edx, T));
2309     if (Ty == IceType_i8) {
2310       // Register ah must be moved into one of {al,bl,cl,dl} before it can be
2311       // moved into a general 8-bit register.
2312       auto *T_AhRcvr = makeReg(Ty);
2313       T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
2314       _mov(T_AhRcvr, T_edx);
2315       T_edx = T_AhRcvr;
2316     }
2317     _mov(Dest, T_edx);
2318   } break;
2319   case InstArithmetic::Fadd:
2320     _mov(T, Src0);
2321     _addss(T, Src1);
2322     _mov(Dest, T);
2323     break;
2324   case InstArithmetic::Fsub:
2325     _mov(T, Src0);
2326     _subss(T, Src1);
2327     _mov(Dest, T);
2328     break;
2329   case InstArithmetic::Fmul:
2330     _mov(T, Src0);
2331     _mulss(T, Src0 == Src1 ? T : Src1);
2332     _mov(Dest, T);
2333     break;
2334   case InstArithmetic::Fdiv:
2335     _mov(T, Src0);
2336     _divss(T, Src1);
2337     _mov(Dest, T);
2338     break;
2339   case InstArithmetic::Frem:
2340     llvm::report_fatal_error("Helper call was expected");
2341     break;
2342   }
2343 }
2344 
lowerAssign(const InstAssign * Instr)2345 void TargetX8632::lowerAssign(const InstAssign *Instr) {
2346   Variable *Dest = Instr->getDest();
2347   if (Dest->isRematerializable()) {
2348     Context.insert<InstFakeDef>(Dest);
2349     return;
2350   }
2351   Operand *Src = Instr->getSrc(0);
2352   assert(Dest->getType() == Src->getType());
2353   lowerMove(Dest, Src, false);
2354 }
2355 
lowerBr(const InstBr * Br)2356 void TargetX8632::lowerBr(const InstBr *Br) {
2357   if (Br->isUnconditional()) {
2358     _br(Br->getTargetUnconditional());
2359     return;
2360   }
2361   Operand *Cond = Br->getCondition();
2362 
2363   // Handle folding opportunities.
2364   if (const Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
2365     assert(Producer->isDeleted());
2366     switch (BoolFolding::getProducerKind(Producer)) {
2367     default:
2368       break;
2369     case BoolFolding::PK_Icmp32:
2370     case BoolFolding::PK_Icmp64: {
2371       lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Br);
2372       return;
2373     }
2374     case BoolFolding::PK_Fcmp: {
2375       lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Br);
2376       return;
2377     }
2378     case BoolFolding::PK_Arith: {
2379       lowerArithAndConsumer(llvm::cast<InstArithmetic>(Producer), Br);
2380       return;
2381     }
2382     }
2383   }
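  // No folding opportunity: materialize the boolean condition and branch on it,
  // roughly "cmp Cond, 0; jne TargetTrue; jmp TargetFalse".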
2384   Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
2385   Constant *Zero = Ctx->getConstantZero(IceType_i32);
2386   _cmp(Src0, Zero);
2387   _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
2388 }
2389 
2390 // constexprMax returns a (constexpr) max(S0, S1), and it is used for defining
2391 // OperandList in lowerCall. std::max() is supposed to work, but it doesn't.
constexprMax(SizeT S0,SizeT S1)2392 inline constexpr SizeT constexprMax(SizeT S0, SizeT S1) {
2393   return S0 < S1 ? S1 : S0;
2394 }
2395 
lowerCall(const InstCall * Instr)2396 void TargetX8632::lowerCall(const InstCall *Instr) {
2397   // System V x86-32 calling convention lowering:
2398   //
2399   // * At the point before the call, the stack must be aligned to 16 bytes.
2400   //
2401   // * Non-register arguments are pushed onto the stack in right-to-left order,
2402   // such that the left-most argument ends up on the top of the stack at the
2403   // lowest memory address.
2404   //
2405   // * Stack arguments of vector type are aligned to start at the next highest
2406   // multiple of 16 bytes. Other stack arguments are aligned to the next word
2407   // size boundary (4 or 8 bytes, respectively).
2408   //
2409   // This is compatible with the Microsoft x86-32 'cdecl' calling convention,
2410   // which doesn't have a 16-byte stack alignment requirement.
2411 
2412   RequiredStackAlignment =
2413       std::max<size_t>(RequiredStackAlignment, X86_STACK_ALIGNMENT_BYTES);
2414 
2415   constexpr SizeT MaxOperands =
2416       constexprMax(RegX8632::X86_MAX_XMM_ARGS, RegX8632::X86_MAX_GPR_ARGS);
2417   using OperandList = llvm::SmallVector<Operand *, MaxOperands>;
2418 
2419   OperandList XmmArgs;
2420   llvm::SmallVector<SizeT, MaxOperands> XmmArgIndices;
2421   CfgVector<std::pair<const Type, Operand *>> GprArgs;
2422   CfgVector<SizeT> GprArgIndices;
2423   OperandList StackArgs, StackArgLocations;
2424   uint32_t ParameterAreaSizeBytes = 0;
2425 
2426   // Classify each argument operand according to the location where the argument
2427   // is passed.
2428   for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
2429     Operand *Arg = Instr->getArg(i);
2430     const Type Ty = Arg->getType();
2431     // The PNaCl ABI requires the width of arguments to be at least 32 bits.
2432     assert(typeWidthInBytes(Ty) >= 4);
2433     if (isVectorType(Ty) && RegX8632::getRegisterForXmmArgNum(
2434                                 RegX8632::getArgIndex(i, XmmArgs.size()))
2435                                 .hasValue()) {
2436       XmmArgs.push_back(Arg);
2437       XmmArgIndices.push_back(i);
2438     } else if (isScalarIntegerType(Ty) &&
2439                RegX8632::getRegisterForGprArgNum(
2440                    Ty, RegX8632::getArgIndex(i, GprArgs.size()))
2441                    .hasValue()) {
2442       GprArgs.emplace_back(Ty, Arg);
2443       GprArgIndices.push_back(i);
2444     } else {
2445       // Place on stack.
2446       StackArgs.push_back(Arg);
2447       if (isVectorType(Arg->getType())) {
2448         ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
2449       }
2450       Variable *esp = getPhysicalRegister(getStackReg(), WordType);
2451       Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
2452       StackArgLocations.push_back(X86OperandMem::create(Func, Ty, esp, Loc));
2453       ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
2454     }
2455   }
2456   // Ensure there is enough space for the fstp/movs for floating returns.
2457   Variable *Dest = Instr->getDest();
2458   const Type DestTy = Dest ? Dest->getType() : IceType_void;
2459   if (isScalarFloatingType(DestTy)) {
2460     ParameterAreaSizeBytes =
2461         std::max(static_cast<size_t>(ParameterAreaSizeBytes),
2462                  typeWidthInBytesOnStack(DestTy));
2463   }
2464   // Adjust the parameter area so that the stack is aligned. It is assumed that
2465   // the stack is already aligned at the start of the calling sequence.
2466   ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
2467   assert(ParameterAreaSizeBytes <= maxOutArgsSizeBytes());
2468   // Copy arguments that are passed on the stack to the appropriate stack
2469   // locations.  We make sure legalize() is called on each argument at this
2470   // point, to allow availabilityGet() to work.
2471   for (SizeT i = 0, NumStackArgs = StackArgs.size(); i < NumStackArgs; ++i) {
2472     lowerStore(
2473         InstStore::create(Func, legalize(StackArgs[i]), StackArgLocations[i]));
2474   }
2475   // Copy arguments to be passed in registers to the appropriate registers.
2476   for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
2477     XmmArgs[i] = legalizeToReg(legalize(XmmArgs[i]),
2478                                RegX8632::getRegisterForXmmArgNum(
2479                                    RegX8632::getArgIndex(XmmArgIndices[i], i)));
2480   }
2481   // Materialize moves for arguments passed in GPRs.
2482   for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
2483     const Type SignatureTy = GprArgs[i].first;
2484     Operand *Arg =
2485         legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable);
2486     GprArgs[i].second = legalizeToReg(
2487         Arg, RegX8632::getRegisterForGprArgNum(
2488                  Arg->getType(), RegX8632::getArgIndex(GprArgIndices[i], i)));
2489     assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32);
2490     assert(SignatureTy == Arg->getType());
2491     (void)SignatureTy;
2492   }
2493   // Generate a FakeUse of register arguments so that they do not get dead code
2494   // eliminated as a result of the FakeKill of scratch registers after the call.
2495   // These need to be right before the call instruction.
2496   for (auto *Arg : XmmArgs) {
2497     Context.insert<InstFakeUse>(llvm::cast<Variable>(Arg));
2498   }
2499   for (auto &ArgPair : GprArgs) {
2500     Context.insert<InstFakeUse>(llvm::cast<Variable>(ArgPair.second));
2501   }
2502   // Generate the call instruction. Assign its result to a temporary with high
2503   // register allocation weight.
2504   // ReturnReg doubles as ReturnRegLo as necessary.
2505   Variable *ReturnReg = nullptr;
2506   Variable *ReturnRegHi = nullptr;
2507   if (Dest) {
2508     switch (DestTy) {
2509     case IceType_NUM:
2510     case IceType_void:
2511     case IceType_i1:
2512     case IceType_i8:
2513     case IceType_i16:
2514       llvm::report_fatal_error("Invalid Call dest type");
2515       break;
2516     case IceType_i32:
2517       ReturnReg = makeReg(DestTy, RegX8632::Reg_eax);
2518       break;
2519     case IceType_i64:
2520       ReturnReg = makeReg(IceType_i32, RegX8632::Reg_eax);
2521       ReturnRegHi = makeReg(IceType_i32, RegX8632::Reg_edx);
2522       break;
2523     case IceType_f32:
2524     case IceType_f64:
2525       // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with
2526       // the fstp instruction.
2527       break;
2528     // Fallthrough intended.
2529     case IceType_v4i1:
2530     case IceType_v8i1:
2531     case IceType_v16i1:
2532     case IceType_v16i8:
2533     case IceType_v8i16:
2534     case IceType_v4i32:
2535     case IceType_v4f32:
2536       ReturnReg = makeReg(DestTy, RegX8632::Reg_xmm0);
2537       break;
2538     }
2539   }
2540   // Emit the call to the function.
2541   Operand *CallTarget =
2542       legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm | Legal_AddrAbs);
2543   size_t NumVariadicFpArgs = Instr->isVariadic() ? XmmArgs.size() : 0;
2544   Inst *NewCall = emitCallToTarget(CallTarget, ReturnReg, NumVariadicFpArgs);
2545   // Keep the upper return register live on this 32-bit platform.
2546   if (ReturnRegHi)
2547     Context.insert<InstFakeDef>(ReturnRegHi);
2548   // Mark the call as killing all the caller-save registers.
2549   Context.insert<InstFakeKill>(NewCall);
2550   // Handle x86-32 floating point returns.
2551   if (Dest != nullptr && isScalarFloatingType(DestTy)) {
2552     // Special treatment for an FP function which returns its result in st(0).
2553     // If Dest ends up being a physical xmm register, the fstp emit code will
2554     // route st(0) through the space reserved in the function argument area
2555     // we allocated.
2556     _fstp(Dest);
2557     // Create a fake use of Dest in case it actually isn't used, because st(0)
2558     // still needs to be popped.
2559     Context.insert<InstFakeUse>(Dest);
2560   }
2561   // Generate a FakeUse to keep the call live if necessary.
2562   if (Instr->hasSideEffects() && ReturnReg) {
2563     Context.insert<InstFakeUse>(ReturnReg);
2564   }
2565   // Process the return value, if any.
2566   if (Dest == nullptr)
2567     return;
2568   // Assign the result of the call to Dest.  Route it through a temporary so
2569   // that the local register availability peephole can be subsequently used.
2570   Variable *Tmp = nullptr;
2571   if (isVectorType(DestTy)) {
2572     assert(ReturnReg && "Vector type requires a return register");
2573     Tmp = makeReg(DestTy);
2574     _movp(Tmp, ReturnReg);
2575     _movp(Dest, Tmp);
2576   } else if (!isScalarFloatingType(DestTy)) {
2577     assert(isScalarIntegerType(DestTy));
2578     assert(ReturnReg && "Integer type requires a return register");
2579     if (DestTy == IceType_i64) {
2580       assert(ReturnRegHi && "64-bit type requires two return registers");
2581       auto *Dest64On32 = llvm::cast<Variable64On32>(Dest);
2582       Variable *DestLo = Dest64On32->getLo();
2583       Variable *DestHi = Dest64On32->getHi();
2584       _mov(Tmp, ReturnReg);
2585       _mov(DestLo, Tmp);
2586       Variable *TmpHi = nullptr;
2587       _mov(TmpHi, ReturnRegHi);
2588       _mov(DestHi, TmpHi);
2589     } else {
2590       _mov(Tmp, ReturnReg);
2591       _mov(Dest, Tmp);
2592     }
2593   }
2594 }
2595 
lowerCast(const InstCast * Instr)2596 void TargetX8632::lowerCast(const InstCast *Instr) {
2597   // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
2598   InstCast::OpKind CastKind = Instr->getCastKind();
2599   Variable *Dest = Instr->getDest();
2600   Type DestTy = Dest->getType();
2601   switch (CastKind) {
2602   default:
2603     Func->setError("Cast type not supported");
2604     return;
2605   case InstCast::Sext: {
2606     // Src0RM is the source operand legalized to physical register or memory,
2607     // but not immediate, since the relevant x86 native instructions don't
2608     // allow an immediate operand. If the operand is an immediate, we could
2609     // consider computing the strength-reduced result at translation time, but
2610     // we're unlikely to see something like that in the bitcode that the
2611     // optimizer wouldn't have already taken care of.
2612     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2613     if (isVectorType(DestTy)) {
2614       if (DestTy == IceType_v16i8) {
2615         // onemask = materialize(1,1,...); dst = (src & onemask) > 0
2616         Variable *OneMask = makeVectorOfOnes(DestTy);
2617         Variable *T = makeReg(DestTy);
2618         _movp(T, Src0RM);
2619         _pand(T, OneMask);
2620         Variable *Zeros = makeVectorOfZeros(DestTy);
2621         _pcmpgt(T, Zeros);
2622         _movp(Dest, T);
2623       } else {
2624         /// width = width(elty) - 1; dest = (src << width) >> width
2625         SizeT ShiftAmount =
2626             X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) - 1;
2627         Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
2628         Variable *T = makeReg(DestTy);
2629         _movp(T, Src0RM);
2630         _psll(T, ShiftConstant);
2631         _psra(T, ShiftConstant);
2632         _movp(Dest, T);
2633       }
2634     } else if (DestTy == IceType_i64) {
2635       // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
2636       Constant *Shift = Ctx->getConstantInt32(31);
2637       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
2638       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2639       Variable *T_Lo = makeReg(DestLo->getType());
2640       if (Src0RM->getType() == IceType_i32) {
2641         _mov(T_Lo, Src0RM);
2642       } else if (Src0RM->getType() == IceType_i1) {
2643         _movzx(T_Lo, Src0RM);
2644         _shl(T_Lo, Shift);
2645         _sar(T_Lo, Shift);
2646       } else {
2647         _movsx(T_Lo, Src0RM);
2648       }
2649       _mov(DestLo, T_Lo);
2650       Variable *T_Hi = nullptr;
2651       _mov(T_Hi, T_Lo);
2652       if (Src0RM->getType() != IceType_i1)
2653         // For i1, the sar instruction is already done above.
2654         _sar(T_Hi, Shift);
2655       _mov(DestHi, T_Hi);
2656     } else if (Src0RM->getType() == IceType_i1) {
2657       // t1 = src
2658       // shl t1, dst_bitwidth - 1
2659       // sar t1, dst_bitwidth - 1
2660       // dst = t1
2661       size_t DestBits = X86_CHAR_BIT * typeWidthInBytes(DestTy);
2662       Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
2663       Variable *T = makeReg(DestTy);
2664       if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) {
2665         _mov(T, Src0RM);
2666       } else {
2667         // Widen the source using movsx or movzx. (It doesn't matter which one,
2668         // since the following shl/sar overwrite the bits.)
2669         _movzx(T, Src0RM);
2670       }
2671       _shl(T, ShiftAmount);
2672       _sar(T, ShiftAmount);
2673       _mov(Dest, T);
2674     } else {
2675       // t1 = movsx src; dst = t1
2676       Variable *T = makeReg(DestTy);
2677       _movsx(T, Src0RM);
2678       _mov(Dest, T);
2679     }
2680     break;
2681   }
2682   case InstCast::Zext: {
2683     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2684     if (isVectorType(DestTy)) {
2685       // onemask = materialize(1,1,...); dest = onemask & src
2686       Variable *OneMask = makeVectorOfOnes(DestTy);
2687       Variable *T = makeReg(DestTy);
2688       _movp(T, Src0RM);
2689       _pand(T, OneMask);
2690       _movp(Dest, T);
2691     } else if (DestTy == IceType_i64) {
2692       // t1=movzx src; dst.lo=t1; dst.hi=0
2693       Constant *Zero = Ctx->getConstantZero(IceType_i32);
2694       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
2695       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2696       Variable *Tmp = makeReg(DestLo->getType());
2697       if (Src0RM->getType() == IceType_i32) {
2698         _mov(Tmp, Src0RM);
2699       } else {
2700         _movzx(Tmp, Src0RM);
2701       }
2702       _mov(DestLo, Tmp);
2703       _mov(DestHi, Zero);
2704     } else if (Src0RM->getType() == IceType_i1) {
2705       // t = Src0RM; Dest = t
2706       Variable *T = nullptr;
2707       if (DestTy == IceType_i8) {
2708         _mov(T, Src0RM);
2709       } else {
2710         assert(DestTy != IceType_i1);
2711         assert(DestTy != IceType_i64);
2712         // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
2713         // In x86-64 we need to widen T to 64-bits to ensure that T -- if
2714         // written to the stack (i.e., in -Om1) -- will be fully zero-extended.
2715         T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32);
2716         _movzx(T, Src0RM);
2717       }
2718       _mov(Dest, T);
2719     } else {
2720       // t1 = movzx src; dst = t1
2721       Variable *T = makeReg(DestTy);
2722       _movzx(T, Src0RM);
2723       _mov(Dest, T);
2724     }
2725     break;
2726   }
2727   case InstCast::Trunc: {
2728     if (isVectorType(DestTy)) {
2729       // onemask = materialize(1,1,...); dst = src & onemask
2730       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2731       Type Src0Ty = Src0RM->getType();
2732       Variable *OneMask = makeVectorOfOnes(Src0Ty);
2733       Variable *T = makeReg(DestTy);
2734       _movp(T, Src0RM);
2735       _pand(T, OneMask);
2736       _movp(Dest, T);
2737     } else if (DestTy == IceType_i1 || DestTy == IceType_i8) {
2738       // Make sure we truncate from and into valid registers.
2739       Operand *Src0 = legalizeUndef(Instr->getSrc(0));
2740       if (Src0->getType() == IceType_i64)
2741         Src0 = loOperand(Src0);
2742       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2743       Variable *T = copyToReg8(Src0RM);
2744       if (DestTy == IceType_i1)
2745         _and(T, Ctx->getConstantInt1(1));
2746       _mov(Dest, T);
2747     } else {
2748       Operand *Src0 = legalizeUndef(Instr->getSrc(0));
2749       if (Src0->getType() == IceType_i64)
2750         Src0 = loOperand(Src0);
2751       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2752       // t1 = trunc Src0RM; Dest = t1
2753       Variable *T = makeReg(DestTy);
2754       _mov(T, Src0RM);
2755       _mov(Dest, T);
2756     }
2757     break;
2758   }
2759   case InstCast::Fptrunc:
2760   case InstCast::Fpext: {
2761     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2762     // t1 = cvt Src0RM; Dest = t1
2763     Variable *T = makeReg(DestTy);
2764     _cvt(T, Src0RM, Insts::Cvt::Float2float);
2765     _mov(Dest, T);
2766     break;
2767   }
2768   case InstCast::Fptosi:
2769     if (isVectorType(DestTy)) {
2770       assert(DestTy == IceType_v4i32);
2771       assert(Instr->getSrc(0)->getType() == IceType_v4f32);
2772       Operand *Src0R = legalizeToReg(Instr->getSrc(0));
2773       Variable *T = makeReg(DestTy);
2774       _cvt(T, Src0R, Insts::Cvt::Tps2dq);
2775       _movp(Dest, T);
2776     } else if (DestTy == IceType_i64) {
2777       llvm::report_fatal_error("Helper call was expected");
2778     } else {
2779       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2780       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
2781       Variable *T_1 = nullptr;
2782       assert(DestTy != IceType_i64);
2783       T_1 = makeReg(IceType_i32);
2784       // cvt() requires its integer argument to be a GPR.
2785       Variable *T_2 = makeReg(DestTy);
2786       if (isByteSizedType(DestTy)) {
2787         assert(T_1->getType() == IceType_i32);
2788         T_1->setRegClass(RCX86_Is32To8);
2789         T_2->setRegClass(RCX86_IsTrunc8Rcvr);
2790       }
2791       _cvt(T_1, Src0RM, Insts::Cvt::Tss2si);
2792       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
2793       if (DestTy == IceType_i1)
2794         _and(T_2, Ctx->getConstantInt1(1));
2795       _mov(Dest, T_2);
2796     }
2797     break;
2798   case InstCast::Fptoui:
2799     if (isVectorType(DestTy)) {
2800       llvm::report_fatal_error("Helper call was expected");
2801     } else if (DestTy == IceType_i64 || DestTy == IceType_i32) {
2802       llvm::report_fatal_error("Helper call was expected");
2803     } else {
2804       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2805       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
2806       assert(DestTy != IceType_i64);
2807       Variable *T_1 = nullptr;
2808       assert(DestTy != IceType_i32);
2809       T_1 = makeReg(IceType_i32);
2810       Variable *T_2 = makeReg(DestTy);
2811       if (isByteSizedType(DestTy)) {
2812         assert(T_1->getType() == IceType_i32);
2813         T_1->setRegClass(RCX86_Is32To8);
2814         T_2->setRegClass(RCX86_IsTrunc8Rcvr);
2815       }
2816       _cvt(T_1, Src0RM, Insts::Cvt::Tss2si);
2817       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
2818       if (DestTy == IceType_i1)
2819         _and(T_2, Ctx->getConstantInt1(1));
2820       _mov(Dest, T_2);
2821     }
2822     break;
2823   case InstCast::Sitofp:
2824     if (isVectorType(DestTy)) {
2825       assert(DestTy == IceType_v4f32);
2826       assert(Instr->getSrc(0)->getType() == IceType_v4i32);
2827       Operand *Src0R = legalizeToReg(Instr->getSrc(0));
2828       Variable *T = makeReg(DestTy);
2829       _cvt(T, Src0R, Insts::Cvt::Dq2ps);
2830       _movp(Dest, T);
2831     } else if (Instr->getSrc(0)->getType() == IceType_i64) {
2832       llvm::report_fatal_error("Helper call was expected");
2833     } else {
2834       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2835       // Sign-extend the operand.
2836       // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
2837       Variable *T_1 = nullptr;
2838       assert(Src0RM->getType() != IceType_i64);
2839       T_1 = makeReg(IceType_i32);
2840       Variable *T_2 = makeReg(DestTy);
2841       if (Src0RM->getType() == T_1->getType())
2842         _mov(T_1, Src0RM);
2843       else
2844         _movsx(T_1, Src0RM);
2845       _cvt(T_2, T_1, Insts::Cvt::Si2ss);
2846       _mov(Dest, T_2);
2847     }
2848     break;
2849   case InstCast::Uitofp: {
2850     Operand *Src0 = Instr->getSrc(0);
2851     if (isVectorType(Src0->getType())) {
2852       llvm::report_fatal_error("Helper call was expected");
2853     } else if (Src0->getType() == IceType_i64 ||
2854                Src0->getType() == IceType_i32) {
2855       llvm::report_fatal_error("Helper call was expected");
2856     } else {
2857       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2858       // Zero-extend the operand.
2859       // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
2860       Variable *T_1 = nullptr;
2861       assert(Src0RM->getType() != IceType_i64);
2862       assert(Src0RM->getType() != IceType_i32);
2863       T_1 = makeReg(IceType_i32);
2864       Variable *T_2 = makeReg(DestTy);
2865       if (Src0RM->getType() == T_1->getType())
2866         _mov(T_1, Src0RM);
2867       else
2868         _movzx(T_1, Src0RM)->setMustKeep();
2869       _cvt(T_2, T_1, Insts::Cvt::Si2ss);
2870       _mov(Dest, T_2);
2871     }
2872     break;
2873   }
2874   case InstCast::Bitcast: {
2875     Operand *Src0 = Instr->getSrc(0);
2876     if (DestTy == Src0->getType()) {
2877       auto *Assign = InstAssign::create(Func, Dest, Src0);
2878       lowerAssign(Assign);
2879       return;
2880     }
2881     switch (DestTy) {
2882     default:
2883       llvm_unreachable("Unexpected Bitcast dest type");
2884     case IceType_i8: {
2885       llvm::report_fatal_error("Helper call was expected");
2886     } break;
2887     case IceType_i16: {
2888       llvm::report_fatal_error("Helper call was expected");
2889     } break;
2890     case IceType_i32:
2891     case IceType_f32: {
2892       Variable *Src0R = legalizeToReg(Src0);
2893       Variable *T = makeReg(DestTy);
2894       _movd(T, Src0R);
2895       _mov(Dest, T);
2896     } break;
2897     case IceType_i64: {
2898       assert(Src0->getType() == IceType_f64);
2899       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2900       // a.i64 = bitcast b.f64 ==>
2901       //   s.f64 = spill b.f64
2902       //   t_lo.i32 = lo(s.f64)
2903       //   a_lo.i32 = t_lo.i32
2904       //   t_hi.i32 = hi(s.f64)
2905       //   a_hi.i32 = t_hi.i32
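           // The f64 is bounced through a stack slot so its two 32-bit halves
           // can be read back with ordinary 32-bit moves; x86-32 has no single
           // instruction that moves an xmm value into a pair of GPRs.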
2906       Operand *SpillLo, *SpillHi;
2907       if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
2908         Variable *Spill = Func->makeVariable(IceType_f64);
2909         Spill->setLinkedTo(Src0Var);
2910         Spill->setMustNotHaveReg();
2911         _movq(Spill, Src0RM);
2912         SpillLo = VariableSplit::create(Func, Spill, VariableSplit::Low);
2913         SpillHi = VariableSplit::create(Func, Spill, VariableSplit::High);
2914       } else {
2915         SpillLo = loOperand(Src0RM);
2916         SpillHi = hiOperand(Src0RM);
2917       }
2918 
2919       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
2920       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2921       Variable *T_Lo = makeReg(IceType_i32);
2922       Variable *T_Hi = makeReg(IceType_i32);
2923 
2924       _mov(T_Lo, SpillLo);
2925       _mov(DestLo, T_Lo);
2926       _mov(T_Hi, SpillHi);
2927       _mov(DestHi, T_Hi);
2928     } break;
2929     case IceType_f64: {
2930       assert(Src0->getType() == IceType_i64);
2931       Src0 = legalize(Src0);
2932       if (llvm::isa<X86OperandMem>(Src0)) {
2933         Variable *T = makeReg(DestTy);
2934         _movq(T, Src0);
2935         _movq(Dest, T);
2936         break;
2937       }
2938       // a.f64 = bitcast b.i64 ==>
2939       //   t_lo.i32 = b_lo.i32
2940       //   FakeDef(s.f64)
2941       //   lo(s.f64) = t_lo.i32
2942       //   t_hi.i32 = b_hi.i32
2943       //   hi(s.f64) = t_hi.i32
2944       //   a.f64 = s.f64
2945       Variable *Spill = Func->makeVariable(IceType_f64);
2946       Spill->setLinkedTo(Dest);
2947       Spill->setMustNotHaveReg();
2948 
2949       Variable *T_Lo = nullptr, *T_Hi = nullptr;
2950       auto *SpillLo = VariableSplit::create(Func, Spill, VariableSplit::Low);
2951       auto *SpillHi = VariableSplit::create(Func, Spill, VariableSplit::High);
2952       _mov(T_Lo, loOperand(Src0));
2953       // Technically, the Spill is defined after the _store happens, but
2954       // SpillLo is considered a "use" of Spill so define Spill before it is
2955       // used.
2956       Context.insert<InstFakeDef>(Spill);
2957       _store(T_Lo, SpillLo);
2958       _mov(T_Hi, hiOperand(Src0));
2959       _store(T_Hi, SpillHi);
2960       _movq(Dest, Spill);
2961     } break;
2962     case IceType_v8i1: {
2963       llvm::report_fatal_error("Helper call was expected");
2964     } break;
2965     case IceType_v16i1: {
2966       llvm::report_fatal_error("Helper call was expected");
2967     } break;
2968     case IceType_v8i16:
2969     case IceType_v16i8:
2970     case IceType_v4i32:
2971     case IceType_v4f32: {
2972       if (Src0->getType() == IceType_i32) {
2973         // Bitcast requires equal type sizes, which isn't strictly the case
2974         // between scalars and vectors, but to emulate v4i8 vectors one has to
2975         // use v16i8 vectors.
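             // movd writes the 32-bit source into the low lane and zeroes the
             // remaining bits of the xmm destination.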
2976         Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2977         Variable *T = makeReg(DestTy);
2978         _movd(T, Src0RM);
2979         _mov(Dest, T);
2980       } else {
2981         _movp(Dest, legalizeToReg(Src0));
2982       }
2983     } break;
2984     }
2985     break;
2986   }
2987   }
2988 }
2989 
lowerExtractElement(const InstExtractElement * Instr)2990 void TargetX8632::lowerExtractElement(const InstExtractElement *Instr) {
2991   Operand *SourceVectNotLegalized = Instr->getSrc(0);
2992   auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(1));
2993   // Only constant indices are allowed in PNaCl IR.
2994   assert(ElementIndex);
2995 
2996   unsigned Index = ElementIndex->getValue();
2997   Type Ty = SourceVectNotLegalized->getType();
2998   Type ElementTy = typeElementType(Ty);
2999   Type InVectorElementTy = InstX86Base::getInVectorElementType(Ty);
3000 
3001   // TODO(wala): Determine the best lowering sequences for each type.
3002   bool CanUsePextr = Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
3003                      (InstructionSet >= SSE4_1 && Ty != IceType_v4f32);
3004   Variable *ExtractedElementR =
3005       makeReg(CanUsePextr ? IceType_i32 : InVectorElementTy);
3006   if (CanUsePextr) {
3007     // Use pextrb, pextrw, or pextrd.  The "b" and "w" versions clear the upper
3008     // bits of the destination register, so we represent this by always
3009     // extracting into an i32 register.  The _mov into Dest below will do
3010     // truncation as necessary.
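         // For example, pextrw r32, xmm, imm places the selected 16-bit lane
         // in the low bits of the destination GPR and zeroes the rest.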
3011     Constant *Mask = Ctx->getConstantInt32(Index);
3012     Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized);
3013     _pextr(ExtractedElementR, SourceVectR, Mask);
3014   } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
3015     // Use pshufd and movd/movss.
3016     Variable *T = nullptr;
3017     if (Index) {
3018       // The shuffle only needs to occur if the element to be extracted is not
3019       // at the lowest index.
3020       Constant *Mask = Ctx->getConstantInt32(Index);
3021       T = makeReg(Ty);
3022       _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
3023     } else {
3024       T = legalizeToReg(SourceVectNotLegalized);
3025     }
3026 
3027     if (InVectorElementTy == IceType_i32) {
3028       _movd(ExtractedElementR, T);
3029     } else { // Ty == IceType_f32
3030       // TODO(wala): _movss is only used here because _mov does not allow a
3031       // vector source and a scalar destination.  _mov should be able to be
3032       // used here.
3033       // _movss is a binary instruction, so the FakeDef is needed to keep the
3034       // live range analysis consistent.
3035       Context.insert<InstFakeDef>(ExtractedElementR);
3036       _movss(ExtractedElementR, T);
3037     }
3038   } else {
3039     assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
3040     // Spill the value to a stack slot and do the extraction in memory.
3041     //
3042     // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
3043     // for legalizing to mem is implemented.
3044     Variable *Slot = Func->makeVariable(Ty);
3045     Slot->setMustNotHaveReg();
3046     _movp(Slot, legalizeToReg(SourceVectNotLegalized));
3047 
3048     // Compute the location of the element in memory.
3049     unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
3050     X86OperandMem *Loc =
3051         getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
3052     _mov(ExtractedElementR, Loc);
3053   }
3054 
3055   if (ElementTy == IceType_i1) {
3056     // Truncate extracted integers to i1s if necessary.
3057     Variable *T = makeReg(IceType_i1);
3058     InstCast *Cast =
3059         InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
3060     lowerCast(Cast);
3061     ExtractedElementR = T;
3062   }
3063 
3064   // Copy the element to the destination.
3065   Variable *Dest = Instr->getDest();
3066   _mov(Dest, ExtractedElementR);
3067 }
3068 
lowerFcmp(const InstFcmp * Fcmp)3069 void TargetX8632::lowerFcmp(const InstFcmp *Fcmp) {
3070   Variable *Dest = Fcmp->getDest();
3071 
3072   if (isVectorType(Dest->getType())) {
3073     lowerFcmpVector(Fcmp);
3074   } else {
3075     constexpr Inst *Consumer = nullptr;
3076     lowerFcmpAndConsumer(Fcmp, Consumer);
3077   }
3078 }
3079 
lowerFcmpAndConsumer(const InstFcmp * Fcmp,const Inst * Consumer)3080 void TargetX8632::lowerFcmpAndConsumer(const InstFcmp *Fcmp,
3081                                        const Inst *Consumer) {
3082   Operand *Src0 = Fcmp->getSrc(0);
3083   Operand *Src1 = Fcmp->getSrc(1);
3084   Variable *Dest = Fcmp->getDest();
3085 
3086   if (Consumer != nullptr) {
3087     if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3088       if (lowerOptimizeFcmpSelect(Fcmp, Select))
3089         return;
3090     }
3091   }
3092 
3093   if (isVectorType(Dest->getType())) {
3094     lowerFcmp(Fcmp);
3095     if (Consumer != nullptr)
3096       lowerSelectVector(llvm::cast<InstSelect>(Consumer));
3097     return;
3098   }
3099 
3100   // Lowering a = fcmp cond, b, c
3101   //   ucomiss b, c       /* only if C1 != Br_None */
3102   //                      /* but swap b,c order if SwapOperands==true */
3103   //   mov a, <default>
3104   //   j<C1> label        /* only if C1 != Br_None */
3105   //   j<C2> label        /* only if C2 != Br_None */
3106   //   FakeUse(a)         /* only if C1 != Br_None */
3107   //   mov a, !<default>  /* only if C1 != Br_None */
3108   //   label:             /* only if C1 != Br_None */
3109   //
3110   // setcc lowering when C1 != Br_None && C2 == Br_None:
3111   //   ucomiss b, c       /* but swap b,c order if SwapOperands==true */
3112   //   setcc a, C1
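       // A second branch condition (C2) is needed for predicates such as
       // ordered-equal: ucomiss reports "unordered" through PF, which has to
       // be tested separately from the ZF-based equality check.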
3113   InstFcmp::FCond Condition = Fcmp->getCondition();
3114   assert(static_cast<size_t>(Condition) < TableFcmpSize);
3115   if (TableFcmp[Condition].SwapScalarOperands)
3116     std::swap(Src0, Src1);
3117   const bool HasC1 = (TableFcmp[Condition].C1 != CondX86::Br_None);
3118   const bool HasC2 = (TableFcmp[Condition].C2 != CondX86::Br_None);
3119   if (HasC1) {
3120     Src0 = legalize(Src0);
3121     Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3122     Variable *T = nullptr;
3123     _mov(T, Src0);
3124     _ucomiss(T, Src1RM);
3125     if (!HasC2) {
3126       assert(TableFcmp[Condition].Default);
3127       setccOrConsumer(TableFcmp[Condition].C1, Dest, Consumer);
3128       return;
3129     }
3130   }
3131   int32_t IntDefault = TableFcmp[Condition].Default;
3132   if (Consumer == nullptr) {
3133     Constant *Default = Ctx->getConstantInt(Dest->getType(), IntDefault);
3134     _mov(Dest, Default);
3135     if (HasC1) {
3136       InstX86Label *Label = InstX86Label::create(Func, this);
3137       _br(TableFcmp[Condition].C1, Label);
3138       if (HasC2) {
3139         _br(TableFcmp[Condition].C2, Label);
3140       }
3141       Constant *NonDefault = Ctx->getConstantInt(Dest->getType(), !IntDefault);
3142       _redefined(_mov(Dest, NonDefault));
3143       Context.insert(Label);
3144     }
3145     return;
3146   }
3147   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3148     CfgNode *TrueSucc = Br->getTargetTrue();
3149     CfgNode *FalseSucc = Br->getTargetFalse();
3150     if (IntDefault != 0)
3151       std::swap(TrueSucc, FalseSucc);
3152     if (HasC1) {
3153       _br(TableFcmp[Condition].C1, FalseSucc);
3154       if (HasC2) {
3155         _br(TableFcmp[Condition].C2, FalseSucc);
3156       }
3157       _br(TrueSucc);
3158       return;
3159     }
3160     _br(FalseSucc);
3161     return;
3162   }
3163   if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3164     Operand *SrcT = Select->getTrueOperand();
3165     Operand *SrcF = Select->getFalseOperand();
3166     Variable *SelectDest = Select->getDest();
3167     if (IntDefault != 0)
3168       std::swap(SrcT, SrcF);
3169     lowerMove(SelectDest, SrcF, false);
3170     if (HasC1) {
3171       InstX86Label *Label = InstX86Label::create(Func, this);
3172       _br(TableFcmp[Condition].C1, Label);
3173       if (HasC2) {
3174         _br(TableFcmp[Condition].C2, Label);
3175       }
3176       static constexpr bool IsRedefinition = true;
3177       lowerMove(SelectDest, SrcT, IsRedefinition);
3178       Context.insert(Label);
3179     }
3180     return;
3181   }
3182   llvm::report_fatal_error("Unexpected consumer type");
3183 }
3184 
lowerFcmpVector(const InstFcmp * Fcmp)3185 void TargetX8632::lowerFcmpVector(const InstFcmp *Fcmp) {
3186   Operand *Src0 = Fcmp->getSrc(0);
3187   Operand *Src1 = Fcmp->getSrc(1);
3188   Variable *Dest = Fcmp->getDest();
3189 
3190   if (!isVectorType(Dest->getType()))
3191     llvm::report_fatal_error("Expected vector compare");
3192 
3193   InstFcmp::FCond Condition = Fcmp->getCondition();
3194   assert(static_cast<size_t>(Condition) < TableFcmpSize);
3195 
3196   if (TableFcmp[Condition].SwapVectorOperands)
3197     std::swap(Src0, Src1);
3198 
3199   Variable *T = nullptr;
3200 
3201   if (Condition == InstFcmp::True) {
3202     // makeVectorOfOnes() requires an integer vector type.
3203     T = makeVectorOfMinusOnes(IceType_v4i32);
3204   } else if (Condition == InstFcmp::False) {
3205     T = makeVectorOfZeros(Dest->getType());
3206   } else {
3207     Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3208     Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3209     if (llvm::isa<X86OperandMem>(Src1RM))
3210       Src1RM = legalizeToReg(Src1RM);
3211 
3212     switch (Condition) {
3213     default: {
3214       const CmppsCond Predicate = TableFcmp[Condition].Predicate;
3215       assert(Predicate != CondX86::Cmpps_Invalid);
3216       T = makeReg(Src0RM->getType());
3217       _movp(T, Src0RM);
3218       _cmpps(T, Src1RM, Predicate);
3219     } break;
3220     case InstFcmp::One: {
3221       // Check both unequal and ordered.
3222       T = makeReg(Src0RM->getType());
3223       Variable *T2 = makeReg(Src0RM->getType());
3224       _movp(T, Src0RM);
3225       _cmpps(T, Src1RM, CondX86::Cmpps_neq);
3226       _movp(T2, Src0RM);
3227       _cmpps(T2, Src1RM, CondX86::Cmpps_ord);
3228       _pand(T, T2);
3229     } break;
3230     case InstFcmp::Ueq: {
3231       // Check both equal or unordered.
3232       T = makeReg(Src0RM->getType());
3233       Variable *T2 = makeReg(Src0RM->getType());
3234       _movp(T, Src0RM);
3235       _cmpps(T, Src1RM, CondX86::Cmpps_eq);
3236       _movp(T2, Src0RM);
3237       _cmpps(T2, Src1RM, CondX86::Cmpps_unord);
3238       _por(T, T2);
3239     } break;
3240     }
3241   }
3242 
3243   assert(T != nullptr);
3244   _movp(Dest, T);
3245   eliminateNextVectorSextInstruction(Dest);
3246 }
3247 
isZero(const Operand * Opnd)3248 inline bool isZero(const Operand *Opnd) {
3249   if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Opnd))
3250     return C64->getValue() == 0;
3251   if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(Opnd))
3252     return C32->getValue() == 0;
3253   return false;
3254 }
3255 
lowerIcmpAndConsumer(const InstIcmp * Icmp,const Inst * Consumer)3256 void TargetX8632::lowerIcmpAndConsumer(const InstIcmp *Icmp,
3257                                        const Inst *Consumer) {
3258   Operand *Src0 = legalize(Icmp->getSrc(0));
3259   Operand *Src1 = legalize(Icmp->getSrc(1));
3260   Variable *Dest = Icmp->getDest();
3261 
3262   if (isVectorType(Dest->getType())) {
3263     lowerIcmp(Icmp);
3264     if (Consumer != nullptr)
3265       lowerSelectVector(llvm::cast<InstSelect>(Consumer));
3266     return;
3267   }
3268 
3269   if (Src0->getType() == IceType_i64) {
3270     lowerIcmp64(Icmp, Consumer);
3271     return;
3272   }
3273 
3274   // cmp b, c
3275   if (isZero(Src1)) {
3276     switch (Icmp->getCondition()) {
3277     default:
3278       break;
3279     case InstIcmp::Uge:
3280       movOrConsumer(true, Dest, Consumer);
3281       return;
3282     case InstIcmp::Ult:
3283       movOrConsumer(false, Dest, Consumer);
3284       return;
3285     }
3286   }
3287   Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
3288   _cmp(Src0RM, Src1);
3289   setccOrConsumer(getIcmp32Mapping(Icmp->getCondition()), Dest, Consumer);
3290 }
3291 
lowerIcmpVector(const InstIcmp * Icmp)3292 void TargetX8632::lowerIcmpVector(const InstIcmp *Icmp) {
3293   Operand *Src0 = legalize(Icmp->getSrc(0));
3294   Operand *Src1 = legalize(Icmp->getSrc(1));
3295   Variable *Dest = Icmp->getDest();
3296 
3297   if (!isVectorType(Dest->getType()))
3298     llvm::report_fatal_error("Expected a vector compare");
3299 
3300   Type Ty = Src0->getType();
3301   // Promote i1 vectors to 128 bit integer vector types.
3302   if (typeElementType(Ty) == IceType_i1) {
3303     Type NewTy = IceType_NUM;
3304     switch (Ty) {
3305     default:
3306       llvm::report_fatal_error("unexpected type");
3307       break;
3308     case IceType_v4i1:
3309       NewTy = IceType_v4i32;
3310       break;
3311     case IceType_v8i1:
3312       NewTy = IceType_v8i16;
3313       break;
3314     case IceType_v16i1:
3315       NewTy = IceType_v16i8;
3316       break;
3317     }
3318     Variable *NewSrc0 = Func->makeVariable(NewTy);
3319     Variable *NewSrc1 = Func->makeVariable(NewTy);
3320     lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
3321     lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
3322     Src0 = NewSrc0;
3323     Src1 = NewSrc1;
3324     Ty = NewTy;
3325   }
3326 
3327   InstIcmp::ICond Condition = Icmp->getCondition();
3328 
3329   Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3330   Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3331 
3332   // SSE2 only has signed comparison operations. Transform unsigned inputs in
3333   // a manner that allows for the use of signed comparison operations by
3334   // flipping the high order bits.
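       // For example, with i32 lanes, XOR-ing each lane with 0x80000000 maps
       // 0x00000000 to 0x80000000 (INT32_MIN) and 0xFFFFFFFF to 0x7FFFFFFF
       // (INT32_MAX), so the signed order of the transformed lanes matches the
       // unsigned order of the originals.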
3335   if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
3336       Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
3337     Variable *T0 = makeReg(Ty);
3338     Variable *T1 = makeReg(Ty);
3339     Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
3340     _movp(T0, Src0RM);
3341     _pxor(T0, HighOrderBits);
3342     _movp(T1, Src1RM);
3343     _pxor(T1, HighOrderBits);
3344     Src0RM = T0;
3345     Src1RM = T1;
3346   }
3347 
3348   Variable *T = makeReg(Ty);
3349   switch (Condition) {
3350   default:
3351     llvm_unreachable("unexpected condition");
3352     break;
3353   case InstIcmp::Eq: {
3354     if (llvm::isa<X86OperandMem>(Src1RM))
3355       Src1RM = legalizeToReg(Src1RM);
3356     _movp(T, Src0RM);
3357     _pcmpeq(T, Src1RM);
3358   } break;
3359   case InstIcmp::Ne: {
3360     if (llvm::isa<X86OperandMem>(Src1RM))
3361       Src1RM = legalizeToReg(Src1RM);
3362     _movp(T, Src0RM);
3363     _pcmpeq(T, Src1RM);
3364     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3365     _pxor(T, MinusOne);
3366   } break;
3367   case InstIcmp::Ugt:
3368   case InstIcmp::Sgt: {
3369     if (llvm::isa<X86OperandMem>(Src1RM))
3370       Src1RM = legalizeToReg(Src1RM);
3371     _movp(T, Src0RM);
3372     _pcmpgt(T, Src1RM);
3373   } break;
3374   case InstIcmp::Uge:
3375   case InstIcmp::Sge: {
3376     // !(Src1RM > Src0RM)
3377     if (llvm::isa<X86OperandMem>(Src0RM))
3378       Src0RM = legalizeToReg(Src0RM);
3379     _movp(T, Src1RM);
3380     _pcmpgt(T, Src0RM);
3381     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3382     _pxor(T, MinusOne);
3383   } break;
3384   case InstIcmp::Ult:
3385   case InstIcmp::Slt: {
3386     if (llvm::isa<X86OperandMem>(Src0RM))
3387       Src0RM = legalizeToReg(Src0RM);
3388     _movp(T, Src1RM);
3389     _pcmpgt(T, Src0RM);
3390   } break;
3391   case InstIcmp::Ule:
3392   case InstIcmp::Sle: {
3393     // !(Src0RM > Src1RM)
3394     if (llvm::isa<X86OperandMem>(Src1RM))
3395       Src1RM = legalizeToReg(Src1RM);
3396     _movp(T, Src0RM);
3397     _pcmpgt(T, Src1RM);
3398     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3399     _pxor(T, MinusOne);
3400   } break;
3401   }
3402 
3403   _movp(Dest, T);
3404   eliminateNextVectorSextInstruction(Dest);
3405 }
3406 
lowerIcmp64(const InstIcmp * Icmp,const Inst * Consumer)3407 void TargetX8632::lowerIcmp64(const InstIcmp *Icmp, const Inst *Consumer) {
3408   // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
3409   Operand *Src0 = legalize(Icmp->getSrc(0));
3410   Operand *Src1 = legalize(Icmp->getSrc(1));
3411   Variable *Dest = Icmp->getDest();
3412   InstIcmp::ICond Condition = Icmp->getCondition();
3413   assert(static_cast<size_t>(Condition) < TableIcmp64Size);
3414   Operand *Src0LoRM = nullptr;
3415   Operand *Src0HiRM = nullptr;
3416   // Legalize the portions of Src0 that are going to be needed.
3417   if (isZero(Src1)) {
3418     switch (Condition) {
3419     default:
3420       llvm_unreachable("unexpected condition");
3421       break;
3422     // These two are not optimized, so we fall through to the general case,
3423     // which needs the upper and lower halves legalized.
3424     case InstIcmp::Sgt:
3425     case InstIcmp::Sle:
3426     // These four compare after performing an "or" of the high and low half, so
3427     // they need the upper and lower halves legalized.
3428     case InstIcmp::Eq:
3429     case InstIcmp::Ule:
3430     case InstIcmp::Ne:
3431     case InstIcmp::Ugt:
3432       Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
3433     // These two test only the high half's sign bit, so they need only
3434     // the upper half legalized.
3435     case InstIcmp::Sge:
3436     case InstIcmp::Slt:
3437       Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
3438       break;
3439 
3440     // These two move constants and hence need no legalization.
3441     case InstIcmp::Uge:
3442     case InstIcmp::Ult:
3443       break;
3444     }
3445   } else {
3446     Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
3447     Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
3448   }
3449   // Optimize comparisons with zero.
3450   if (isZero(Src1)) {
3451     Constant *SignMask = Ctx->getConstantInt32(0x80000000);
3452     Variable *Temp = nullptr;
3453     switch (Condition) {
3454     default:
3455       llvm_unreachable("unexpected condition");
3456       break;
3457     case InstIcmp::Eq:
3458     case InstIcmp::Ule:
3459       // Mov Src0HiRM first, because it was legalized most recently, and will
3460       // sometimes avoid a move before the OR.
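           // x == 0 and unsigned x <= 0 are the same predicate, so OR-ing the
           // two 32-bit halves and testing the zero flag decides the whole i64
           // compare.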
3461       _mov(Temp, Src0HiRM);
3462       _or(Temp, Src0LoRM);
3463       Context.insert<InstFakeUse>(Temp);
3464       setccOrConsumer(CondX86::Br_e, Dest, Consumer);
3465       return;
3466     case InstIcmp::Ne:
3467     case InstIcmp::Ugt:
3468       // Mov Src0HiRM first, because it was legalized most recently, and will
3469       // sometimes avoid a move before the OR.
3470       _mov(Temp, Src0HiRM);
3471       _or(Temp, Src0LoRM);
3472       Context.insert<InstFakeUse>(Temp);
3473       setccOrConsumer(CondX86::Br_ne, Dest, Consumer);
3474       return;
3475     case InstIcmp::Uge:
3476       movOrConsumer(true, Dest, Consumer);
3477       return;
3478     case InstIcmp::Ult:
3479       movOrConsumer(false, Dest, Consumer);
3480       return;
3481     case InstIcmp::Sgt:
3482       break;
3483     case InstIcmp::Sge:
3484       _test(Src0HiRM, SignMask);
3485       setccOrConsumer(CondX86::Br_e, Dest, Consumer);
3486       return;
3487     case InstIcmp::Slt:
3488       _test(Src0HiRM, SignMask);
3489       setccOrConsumer(CondX86::Br_ne, Dest, Consumer);
3490       return;
3491     case InstIcmp::Sle:
3492       break;
3493     }
3494   }
3495   // Handle general compares.
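       // Roughly, the emitted shape (branch conditions come from TableIcmp64,
       // and C1/C2 may be absent) is:
       //   cmp b_hi, c_hi; br C1, true; br C2, false; cmp b_lo, c_lo; br C3, true
       // i.e. the high words decide the result unless they are equal, in which
       // case the low words are compared.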
3496   Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
3497   Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
3498   if (Consumer == nullptr) {
3499     Constant *Zero = Ctx->getConstantInt(Dest->getType(), 0);
3500     Constant *One = Ctx->getConstantInt(Dest->getType(), 1);
3501     InstX86Label *LabelFalse = InstX86Label::create(Func, this);
3502     InstX86Label *LabelTrue = InstX86Label::create(Func, this);
3503     _mov(Dest, One);
3504     _cmp(Src0HiRM, Src1HiRI);
3505     if (TableIcmp64[Condition].C1 != CondX86::Br_None)
3506       _br(TableIcmp64[Condition].C1, LabelTrue);
3507     if (TableIcmp64[Condition].C2 != CondX86::Br_None)
3508       _br(TableIcmp64[Condition].C2, LabelFalse);
3509     _cmp(Src0LoRM, Src1LoRI);
3510     _br(TableIcmp64[Condition].C3, LabelTrue);
3511     Context.insert(LabelFalse);
3512     _redefined(_mov(Dest, Zero));
3513     Context.insert(LabelTrue);
3514     return;
3515   }
3516   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3517     _cmp(Src0HiRM, Src1HiRI);
3518     if (TableIcmp64[Condition].C1 != CondX86::Br_None)
3519       _br(TableIcmp64[Condition].C1, Br->getTargetTrue());
3520     if (TableIcmp64[Condition].C2 != CondX86::Br_None)
3521       _br(TableIcmp64[Condition].C2, Br->getTargetFalse());
3522     _cmp(Src0LoRM, Src1LoRI);
3523     _br(TableIcmp64[Condition].C3, Br->getTargetTrue(), Br->getTargetFalse());
3524     return;
3525   }
3526   if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3527     Operand *SrcT = Select->getTrueOperand();
3528     Operand *SrcF = Select->getFalseOperand();
3529     Variable *SelectDest = Select->getDest();
3530     InstX86Label *LabelFalse = InstX86Label::create(Func, this);
3531     InstX86Label *LabelTrue = InstX86Label::create(Func, this);
3532     lowerMove(SelectDest, SrcT, false);
3533     _cmp(Src0HiRM, Src1HiRI);
3534     if (TableIcmp64[Condition].C1 != CondX86::Br_None)
3535       _br(TableIcmp64[Condition].C1, LabelTrue);
3536     if (TableIcmp64[Condition].C2 != CondX86::Br_None)
3537       _br(TableIcmp64[Condition].C2, LabelFalse);
3538     _cmp(Src0LoRM, Src1LoRI);
3539     _br(TableIcmp64[Condition].C3, LabelTrue);
3540     Context.insert(LabelFalse);
3541     static constexpr bool IsRedefinition = true;
3542     lowerMove(SelectDest, SrcF, IsRedefinition);
3543     Context.insert(LabelTrue);
3544     return;
3545   }
3546   llvm::report_fatal_error("Unexpected consumer type");
3547 }
3548 
setccOrConsumer(BrCond Condition,Variable * Dest,const Inst * Consumer)3549 void TargetX8632::setccOrConsumer(BrCond Condition, Variable *Dest,
3550                                   const Inst *Consumer) {
3551   if (Consumer == nullptr) {
3552     _setcc(Dest, Condition);
3553     return;
3554   }
3555   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3556     _br(Condition, Br->getTargetTrue(), Br->getTargetFalse());
3557     return;
3558   }
3559   if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3560     Operand *SrcT = Select->getTrueOperand();
3561     Operand *SrcF = Select->getFalseOperand();
3562     Variable *SelectDest = Select->getDest();
3563     lowerSelectMove(SelectDest, Condition, SrcT, SrcF);
3564     return;
3565   }
3566   llvm::report_fatal_error("Unexpected consumer type");
3567 }
3568 
movOrConsumer(bool IcmpResult,Variable * Dest,const Inst * Consumer)3569 void TargetX8632::movOrConsumer(bool IcmpResult, Variable *Dest,
3570                                 const Inst *Consumer) {
3571   if (Consumer == nullptr) {
3572     _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
3573     return;
3574   }
3575   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3576     // TODO(sehr,stichnot): This could be done with a single unconditional
3577     // branch instruction, but subzero doesn't know how to handle the resulting
3578     // control flow graph changes now.  Make it do so to eliminate mov and cmp.
3579     _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
3580     _cmp(Dest, Ctx->getConstantInt(Dest->getType(), 0));
3581     _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
3582     return;
3583   }
3584   if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3585     Operand *Src = nullptr;
3586     if (IcmpResult) {
3587       Src = legalize(Select->getTrueOperand(), Legal_Reg | Legal_Imm);
3588     } else {
3589       Src = legalize(Select->getFalseOperand(), Legal_Reg | Legal_Imm);
3590     }
3591     Variable *SelectDest = Select->getDest();
3592     lowerMove(SelectDest, Src, false);
3593     return;
3594   }
3595   llvm::report_fatal_error("Unexpected consumer type");
3596 }
3597 
lowerArithAndConsumer(const InstArithmetic * Arith,const Inst * Consumer)3598 void TargetX8632::lowerArithAndConsumer(const InstArithmetic *Arith,
3599                                         const Inst *Consumer) {
3600   Variable *T = nullptr;
3601   Operand *Src0 = legalize(Arith->getSrc(0));
3602   Operand *Src1 = legalize(Arith->getSrc(1));
3603   Variable *Dest = Arith->getDest();
3604   switch (Arith->getOp()) {
3605   default:
3606     llvm_unreachable("arithmetic operator not AND or OR");
3607     break;
3608   case InstArithmetic::And:
3609     _mov(T, Src0);
3610     // Test cannot have an address in the second position.  Since T is
3611     // guaranteed to be a register and Src1 could be a memory load, ensure
3612     // that the second argument is a register.
3613     if (llvm::isa<Constant>(Src1))
3614       _test(T, Src1);
3615     else
3616       _test(Src1, T);
3617     break;
3618   case InstArithmetic::Or:
3619     _mov(T, Src0);
3620     _or(T, Src1);
3621     break;
3622   }
3623 
3624   if (Consumer == nullptr) {
3625     llvm::report_fatal_error("Expected a consumer instruction");
3626   }
3627   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3628     Context.insert<InstFakeUse>(T);
3629     Context.insert<InstFakeDef>(Dest);
3630     _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
3631     return;
3632   }
3633   llvm::report_fatal_error("Unexpected consumer type");
3634 }
3635 
lowerInsertElement(const InstInsertElement * Instr)3636 void TargetX8632::lowerInsertElement(const InstInsertElement *Instr) {
3637   Operand *SourceVectNotLegalized = Instr->getSrc(0);
3638   Operand *ElementToInsertNotLegalized = Instr->getSrc(1);
3639   auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(2));
3640   // Only constant indices are allowed in PNaCl IR.
3641   assert(ElementIndex);
3642   unsigned Index = ElementIndex->getValue();
3643   assert(Index < typeNumElements(SourceVectNotLegalized->getType()));
3644 
3645   Type Ty = SourceVectNotLegalized->getType();
3646   Type ElementTy = typeElementType(Ty);
3647   Type InVectorElementTy = InstX86Base::getInVectorElementType(Ty);
3648 
3649   if (ElementTy == IceType_i1) {
3650     // Expand the element to the appropriate size for it to be inserted in the
3651     // vector.
3652     Variable *Expanded = Func->makeVariable(InVectorElementTy);
3653     auto *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
3654                                   ElementToInsertNotLegalized);
3655     lowerCast(Cast);
3656     ElementToInsertNotLegalized = Expanded;
3657   }
3658 
3659   if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) {
3660     // Use insertps, pinsrb, pinsrw, or pinsrd.
3661     Operand *ElementRM =
3662         legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
3663     Operand *SourceVectRM =
3664         legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
3665     Variable *T = makeReg(Ty);
3666     _movp(T, SourceVectRM);
3667     if (Ty == IceType_v4f32) {
3668       _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
3669     } else {
3670       // For the pinsrb and pinsrw instructions, when the source operand is a
3671       // register, it must be a full r32 register like eax, and not ax/al/ah.
3672       // For filetype=asm, InstX86Pinsr::emit() compensates for the use of
3673       // r16 and r8 by converting them through getBaseReg(), while
3674       // emitIAS() validates that the original and base register encodings
3675       // are the same.
3676       if (ElementRM->getType() == IceType_i8 &&
3677           llvm::isa<Variable>(ElementRM)) {
3678         // Don't use ah/bh/ch/dh for pinsrb.
3679         ElementRM = copyToReg8(ElementRM);
3680       }
3681       _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
3682     }
3683     _movp(Instr->getDest(), T);
3684   } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
3685     // Use shufps or movss.
3686     Variable *ElementR = nullptr;
3687     Operand *SourceVectRM =
3688         legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
3689 
3690     if (InVectorElementTy == IceType_f32) {
3691       // ElementR will be in an XMM register since it is floating point.
3692       ElementR = legalizeToReg(ElementToInsertNotLegalized);
3693     } else {
3694       // Copy an integer to an XMM register.
3695       Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
3696       ElementR = makeReg(Ty);
3697       _movd(ElementR, T);
3698     }
3699 
3700     if (Index == 0) {
3701       Variable *T = makeReg(Ty);
3702       _movp(T, SourceVectRM);
3703       _movss(T, ElementR);
3704       _movp(Instr->getDest(), T);
3705       return;
3706     }
3707 
3708     // shufps treats the source and destination operands as vectors of four
3709     // doublewords. The destination's two high doublewords are selected from
3710     // the source operand and the two low doublewords are selected from the
3711     // (original value of) the destination operand. An insertelement operation
3712     // can be effected with a sequence of two shufps operations with
3713     // appropriate masks. In all cases below, Element[0] is being inserted into
3714     // SourceVectOperand. Indices are ordered from left to right.
3715     //
3716     // insertelement into index 1 (result is stored in ElementR):
3717     //   ElementR := ElementR[0, 0] SourceVectRM[0, 0]
3718     //   ElementR := ElementR[3, 0] SourceVectRM[2, 3]
3719     //
3720     // insertelement into index 2 (result is stored in T):
3721     //   T := SourceVectRM
3722     //   ElementR := ElementR[0, 0] T[0, 3]
3723     //   T := T[0, 1] ElementR[0, 3]
3724     //
3725     // insertelement into index 3 (result is stored in T):
3726     //   T := SourceVectRM
3727     //   ElementR := ElementR[0, 0] T[0, 2]
3728     //   T := T[0, 1] ElementR[3, 0]
3729     const unsigned char Mask1[3] = {0, 192, 128};
3730     const unsigned char Mask2[3] = {227, 196, 52};
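         // In binary: Mask1 = {0b00000000, 0b11000000, 0b10000000} and
         // Mask2 = {0b11100011, 0b11000100, 0b00110100}; each byte packs the
         // four 2-bit doubleword selectors that shufps consumes.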
3731 
3732     Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
3733     Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);
3734 
3735     if (Index == 1) {
3736       _shufps(ElementR, SourceVectRM, Mask1Constant);
3737       _shufps(ElementR, SourceVectRM, Mask2Constant);
3738       _movp(Instr->getDest(), ElementR);
3739     } else {
3740       Variable *T = makeReg(Ty);
3741       _movp(T, SourceVectRM);
3742       _shufps(ElementR, T, Mask1Constant);
3743       _shufps(T, ElementR, Mask2Constant);
3744       _movp(Instr->getDest(), T);
3745     }
3746   } else {
3747     assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
3748     // Spill the value to a stack slot and perform the insertion in memory.
3749     //
3750     // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
3751     // for legalizing to mem is implemented.
3752     Variable *Slot = Func->makeVariable(Ty);
3753     Slot->setMustNotHaveReg();
3754     _movp(Slot, legalizeToReg(SourceVectNotLegalized));
3755 
3756     // Compute the location of the position to insert in memory.
3757     unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
3758     X86OperandMem *Loc =
3759         getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
3760     _store(legalizeToReg(ElementToInsertNotLegalized), Loc);
3761 
3762     Variable *T = makeReg(Ty);
3763     _movp(T, Slot);
3764     _movp(Instr->getDest(), T);
3765   }
3766 }
3767 
lowerIntrinsic(const InstIntrinsic * Instr)3768 void TargetX8632::lowerIntrinsic(const InstIntrinsic *Instr) {
3769   switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicID()) {
3770   case Intrinsics::AtomicCmpxchg: {
3771     if (!Intrinsics::isMemoryOrderValid(
3772             ID, getConstantMemoryOrder(Instr->getArg(3)),
3773             getConstantMemoryOrder(Instr->getArg(4)))) {
3774       Func->setError("Unexpected memory ordering for AtomicCmpxchg");
3775       return;
3776     }
3777     Variable *DestPrev = Instr->getDest();
3778     Operand *PtrToMem = legalize(Instr->getArg(0));
3779     Operand *Expected = legalize(Instr->getArg(1));
3780     Operand *Desired = legalize(Instr->getArg(2));
3781     if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
3782       return;
3783     lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
3784     return;
3785   }
3786   case Intrinsics::AtomicFence:
3787     if (!Intrinsics::isMemoryOrderValid(
3788             ID, getConstantMemoryOrder(Instr->getArg(0)))) {
3789       Func->setError("Unexpected memory ordering for AtomicFence");
3790       return;
3791     }
3792     _mfence();
3793     return;
3794   case Intrinsics::AtomicFenceAll:
3795     // NOTE: FenceAll should prevent any load/store from being moved across the
3796     // fence (both atomic and non-atomic). The InstX8632Mfence instruction is
3797     // currently marked coarsely as "HasSideEffects".
3798     _mfence();
3799     return;
3800   case Intrinsics::AtomicIsLockFree: {
3801     // X86 is always lock free for 8/16/32/64 bit accesses.
3802     // TODO(jvoung): Since the result is constant when given a constant byte
3803     // size, this opens up DCE opportunities.
3804     Operand *ByteSize = Instr->getArg(0);
3805     Variable *Dest = Instr->getDest();
3806     if (auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
3807       Constant *Result;
3808       switch (CI->getValue()) {
3809       default:
3810         // Some x86-64 processors support the cmpxchg16b instruction, which can
3811         // make 16-byte operations lock free (when used with the LOCK prefix).
3812         // However, that's not supported in 32-bit mode, so just return 0 even
3813         // for large sizes.
3814         Result = Ctx->getConstantZero(IceType_i32);
3815         break;
3816       case 1:
3817       case 2:
3818       case 4:
3819       case 8:
3820         Result = Ctx->getConstantInt32(1);
3821         break;
3822       }
3823       _mov(Dest, Result);
3824       return;
3825     }
3826     // The PNaCl ABI requires the byte size to be a compile-time constant.
3827     Func->setError("AtomicIsLockFree byte size should be compile-time const");
3828     return;
3829   }
3830   case Intrinsics::AtomicLoad: {
3831     // We require the memory address to be naturally aligned. Given that is the
3832     // case, normal loads are atomic.
3833     if (!Intrinsics::isMemoryOrderValid(
3834             ID, getConstantMemoryOrder(Instr->getArg(1)))) {
3835       Func->setError("Unexpected memory ordering for AtomicLoad");
3836       return;
3837     }
3838     Variable *Dest = Instr->getDest();
3839     if (auto *Dest64On32 = llvm::dyn_cast<Variable64On32>(Dest)) {
3840       // Follow what GCC does and use a movq instead of what lowerLoad()
3841       // normally does (split the load into two). Thus, this skips
3842       // load/arithmetic op folding. Load/arithmetic folding can't happen
3843       // anyway, since this is x86-32 and integer arithmetic only happens on
3844       // 32-bit quantities.
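           // The single aligned 8-byte movq is also what keeps the load
           // atomic; the pair of 4-byte loads that lowerLoad() would emit
           // would not be.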
3845       Variable *T = makeReg(IceType_f64);
3846       X86OperandMem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64);
3847       _movq(T, Addr);
3848       // Then cast the bits back out of the XMM register to the i64 Dest.
3849       auto *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
3850       lowerCast(Cast);
3851       // Make sure that the atomic load isn't elided when unused.
3852       Context.insert<InstFakeUse>(Dest64On32->getLo());
3853       Context.insert<InstFakeUse>(Dest64On32->getHi());
3854       return;
3855     }
3856     auto *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
3857     lowerLoad(Load);
3858     // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
3859     // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert
3860     // the FakeUse on the last-inserted instruction's dest.
3861     Context.insert<InstFakeUse>(Context.getLastInserted()->getDest());
3862     return;
3863   }
3864   case Intrinsics::AtomicRMW:
3865     if (!Intrinsics::isMemoryOrderValid(
3866             ID, getConstantMemoryOrder(Instr->getArg(3)))) {
3867       Func->setError("Unexpected memory ordering for AtomicRMW");
3868       return;
3869     }
3870     lowerAtomicRMW(
3871         Instr->getDest(),
3872         static_cast<uint32_t>(
3873             llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
3874         Instr->getArg(1), Instr->getArg(2));
3875     return;
3876   case Intrinsics::AtomicStore: {
3877     if (!Intrinsics::isMemoryOrderValid(
3878             ID, getConstantMemoryOrder(Instr->getArg(2)))) {
3879       Func->setError("Unexpected memory ordering for AtomicStore");
3880       return;
3881     }
3882     // We require the memory address to be naturally aligned. Given that is the
3883     // case, normal stores are atomic. Add a fence after the store to make
3884     // it visible.
3885     Operand *Value = Instr->getArg(0);
3886     Operand *Ptr = Instr->getArg(1);
3887     if (Value->getType() == IceType_i64) {
3888       // Use a movq instead of what lowerStore() normally does (split the store
3889       // into two), following what GCC does. Cast the bits from the i64 into
3890       // an xmm register first.
3891       Variable *T = makeReg(IceType_f64);
3892       auto *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value);
3893       lowerCast(Cast);
3894       // Then store XMM w/ a movq.
3895       X86OperandMem *Addr = formMemoryOperand(Ptr, IceType_f64);
3896       _storeq(T, Addr);
3897       _mfence();
3898       return;
3899     }
3900     auto *Store = InstStore::create(Func, Value, Ptr);
3901     lowerStore(Store);
3902     _mfence();
3903     return;
3904   }
3905   case Intrinsics::Bswap: {
3906     Variable *Dest = Instr->getDest();
3907     Operand *Val = Instr->getArg(0);
3908     // In 32-bit mode, bswap only works on 32-bit arguments, and the argument
3909     // must be a register. Use rotate left for 16-bit bswap.
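         // e.g. rol ax, 8 swaps the two bytes of a 16-bit value:
         // 0x1234 -> 0x3412.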
3910     if (Val->getType() == IceType_i64) {
3911       Val = legalizeUndef(Val);
3912       Variable *T_Lo = legalizeToReg(loOperand(Val));
3913       Variable *T_Hi = legalizeToReg(hiOperand(Val));
3914       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
3915       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
3916       _bswap(T_Lo);
3917       _bswap(T_Hi);
3918       _mov(DestLo, T_Hi);
3919       _mov(DestHi, T_Lo);
3920     } else if (Val->getType() == IceType_i32) {
3921       Variable *T = legalizeToReg(Val);
3922       _bswap(T);
3923       _mov(Dest, T);
3924     } else {
3925       assert(Val->getType() == IceType_i16);
3926       Constant *Eight = Ctx->getConstantInt16(8);
3927       Variable *T = nullptr;
3928       Val = legalize(Val);
3929       _mov(T, Val);
3930       _rol(T, Eight);
3931       _mov(Dest, T);
3932     }
3933     return;
3934   }
3935   case Intrinsics::Ctpop: {
3936     Variable *Dest = Instr->getDest();
3937     Operand *Val = Instr->getArg(0);
3938     Type ValTy = Val->getType();
3939     assert(ValTy == IceType_i32 || ValTy == IceType_i64);
3940 
3941     InstCall *Call =
3942         makeHelperCall(ValTy == IceType_i32 ? RuntimeHelper::H_call_ctpop_i32
3943                                             : RuntimeHelper::H_call_ctpop_i64,
3944                        Dest, 1);
3945     Call->addArg(Val);
3946     lowerCall(Call);
3947     // The popcount helpers always return 32-bit values, while the intrinsic's
3948     // signature matches the native POPCNT instruction and fills a 64-bit reg
3949     // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
3950     // the user doesn't do that in the IR. If the user does that in the IR,
3951     // then this zeroing instruction is dead and gets optimized out.
3952     if (Val->getType() == IceType_i64) {
3953       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
3954       Constant *Zero = Ctx->getConstantZero(IceType_i32);
3955       _mov(DestHi, Zero);
3956     }
3957     return;
3958   }
3959   case Intrinsics::Ctlz: {
3960     // The "is zero undef" parameter is ignored and we always return a
3961     // well-defined value.
3962     Operand *Val = legalize(Instr->getArg(0));
3963     Operand *FirstVal;
3964     Operand *SecondVal = nullptr;
3965     if (Val->getType() == IceType_i64) {
3966       FirstVal = loOperand(Val);
3967       SecondVal = hiOperand(Val);
3968     } else {
3969       FirstVal = Val;
3970     }
3971     constexpr bool IsCttz = false;
3972     lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
3973                     SecondVal);
3974     return;
3975   }
3976   case Intrinsics::Cttz: {
3977     // The "is zero undef" parameter is ignored and we always return a
3978     // well-defined value.
3979     Operand *Val = legalize(Instr->getArg(0));
3980     Operand *FirstVal;
3981     Operand *SecondVal = nullptr;
3982     if (Val->getType() == IceType_i64) {
3983       FirstVal = hiOperand(Val);
3984       SecondVal = loOperand(Val);
3985     } else {
3986       FirstVal = Val;
3987     }
3988     constexpr bool IsCttz = true;
3989     lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
3990                     SecondVal);
3991     return;
3992   }
3993   case Intrinsics::Fabs: {
3994     Operand *Src = legalize(Instr->getArg(0));
3995     Type Ty = Src->getType();
3996     Variable *Dest = Instr->getDest();
3997     Variable *T = makeVectorOfFabsMask(Ty);
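         // The fabs mask has every bit set except each element's sign bit, so
         // the pand below clears the sign and leaves the magnitude unchanged.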
3998     // The pand instruction operates on an m128 memory operand, so if Src is an
3999     // f32 or f64, we need to make sure it's in a register.
4000     if (isVectorType(Ty)) {
4001       if (llvm::isa<X86OperandMem>(Src))
4002         Src = legalizeToReg(Src);
4003     } else {
4004       Src = legalizeToReg(Src);
4005     }
4006     _pand(T, Src);
4007     if (isVectorType(Ty))
4008       _movp(Dest, T);
4009     else
4010       _mov(Dest, T);
4011     return;
4012   }
4013   case Intrinsics::Longjmp: {
4014     InstCall *Call = makeHelperCall(RuntimeHelper::H_call_longjmp, nullptr, 2);
4015     Call->addArg(Instr->getArg(0));
4016     Call->addArg(Instr->getArg(1));
4017     lowerCall(Call);
4018     return;
4019   }
4020   case Intrinsics::Memcpy: {
4021     lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
4022     return;
4023   }
4024   case Intrinsics::Memmove: {
4025     lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
4026     return;
4027   }
4028   case Intrinsics::Memset: {
4029     lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
4030     return;
4031   }
4032   case Intrinsics::Setjmp: {
4033     InstCall *Call =
4034         makeHelperCall(RuntimeHelper::H_call_setjmp, Instr->getDest(), 1);
4035     Call->addArg(Instr->getArg(0));
4036     lowerCall(Call);
4037     return;
4038   }
4039   case Intrinsics::Sqrt: {
4040     Operand *Src = legalize(Instr->getArg(0));
4041     Variable *Dest = Instr->getDest();
4042     Variable *T = makeReg(Dest->getType());
4043     _sqrt(T, Src);
4044     if (isVectorType(Dest->getType())) {
4045       _movp(Dest, T);
4046     } else {
4047       _mov(Dest, T);
4048     }
4049     return;
4050   }
4051   case Intrinsics::Stacksave: {
4052     Variable *esp =
4053         Func->getTarget()->getPhysicalRegister(getStackReg(), WordType);
4054     Variable *Dest = Instr->getDest();
4055     _mov(Dest, esp);
4056     return;
4057   }
4058   case Intrinsics::Stackrestore: {
4059     Operand *Src = Instr->getArg(0);
4060     _mov_sp(Src);
4061     return;
4062   }
4063 
4064   case Intrinsics::Trap:
4065     _ud2();
4066     return;
4067   case Intrinsics::LoadSubVector: {
4068     assert(llvm::isa<ConstantInteger32>(Instr->getArg(1)) &&
4069            "LoadSubVector second argument must be a constant");
4070     Variable *Dest = Instr->getDest();
4071     Type Ty = Dest->getType();
4072     auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(1));
4073     Operand *Addr = Instr->getArg(0);
4074     X86OperandMem *Src = formMemoryOperand(Addr, Ty);
4075     doMockBoundsCheck(Src);
4076 
4077     if (Dest->isRematerializable()) {
4078       Context.insert<InstFakeDef>(Dest);
4079       return;
4080     }
4081 
4082     auto *T = makeReg(Ty);
4083     switch (SubVectorSize->getValue()) {
4084     case 4:
4085       _movd(T, Src);
4086       break;
4087     case 8:
4088       _movq(T, Src);
4089       break;
4090     default:
4091       Func->setError("Unexpected size for LoadSubVector");
4092       return;
4093     }
4094     _movp(Dest, T);
4095     return;
4096   }
4097   case Intrinsics::StoreSubVector: {
4098     assert(llvm::isa<ConstantInteger32>(Instr->getArg(2)) &&
4099            "StoreSubVector third argument must be a constant");
4100     auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(2));
4101     Operand *Value = Instr->getArg(0);
4102     Operand *Addr = Instr->getArg(1);
4103     X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
4104     doMockBoundsCheck(NewAddr);
4105 
4106     Value = legalizeToReg(Value);
4107 
4108     switch (SubVectorSize->getValue()) {
4109     case 4:
4110       _stored(Value, NewAddr);
4111       break;
4112     case 8:
4113       _storeq(Value, NewAddr);
4114       break;
4115     default:
4116       Func->setError("Unexpected size for StoreSubVector");
4117       return;
4118     }
4119     return;
4120   }
4121   case Intrinsics::VectorPackSigned: {
4122     Operand *Src0 = Instr->getArg(0);
4123     Operand *Src1 = Instr->getArg(1);
4124     Variable *Dest = Instr->getDest();
4125     auto *T = makeReg(Src0->getType());
4126     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4127     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4128     _movp(T, Src0RM);
4129     _packss(T, Src1RM);
4130     _movp(Dest, T);
4131     return;
4132   }
4133   case Intrinsics::VectorPackUnsigned: {
4134     Operand *Src0 = Instr->getArg(0);
4135     Operand *Src1 = Instr->getArg(1);
4136     Variable *Dest = Instr->getDest();
4137     auto *T = makeReg(Src0->getType());
4138     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4139     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4140     _movp(T, Src0RM);
4141     _packus(T, Src1RM);
4142     _movp(Dest, T);
4143     return;
4144   }
4145   case Intrinsics::SignMask: {
4146     Operand *SrcReg = legalizeToReg(Instr->getArg(0));
4147     Variable *Dest = Instr->getDest();
4148     Variable *T = makeReg(IceType_i32);
4149     if (SrcReg->getType() == IceType_v4f32 ||
4150         SrcReg->getType() == IceType_v4i32 ||
4151         SrcReg->getType() == IceType_v16i8) {
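           // movmskps / pmovmskb copy each lane's sign bit into the low bits
           // of the destination GPR (4 bits for the 32-bit lane types, 16 for
           // v16i8).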
4152       _movmsk(T, SrcReg);
4153     } else {
4154       // TODO(capn): We could implement v8i16 sign mask using packsswb/pmovmskb
4155       llvm::report_fatal_error("Invalid type for SignMask intrinsic");
4156     }
4157     _mov(Dest, T);
4158     return;
4159   }
4160   case Intrinsics::MultiplyHighSigned: {
4161     Operand *Src0 = Instr->getArg(0);
4162     Operand *Src1 = Instr->getArg(1);
4163     Variable *Dest = Instr->getDest();
4164     auto *T = makeReg(Dest->getType());
4165     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4166     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4167     _movp(T, Src0RM);
4168     _pmulhw(T, Src1RM);
4169     _movp(Dest, T);
4170     return;
4171   }
4172   case Intrinsics::MultiplyHighUnsigned: {
4173     Operand *Src0 = Instr->getArg(0);
4174     Operand *Src1 = Instr->getArg(1);
4175     Variable *Dest = Instr->getDest();
4176     auto *T = makeReg(Dest->getType());
4177     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4178     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4179     _movp(T, Src0RM);
4180     _pmulhuw(T, Src1RM);
4181     _movp(Dest, T);
4182     return;
4183   }
4184   case Intrinsics::MultiplyAddPairs: {
4185     Operand *Src0 = Instr->getArg(0);
4186     Operand *Src1 = Instr->getArg(1);
4187     Variable *Dest = Instr->getDest();
4188     auto *T = makeReg(Dest->getType());
4189     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4190     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4191     _movp(T, Src0RM);
4192     _pmaddwd(T, Src1RM);
4193     _movp(Dest, T);
4194     return;
4195   }
4196   case Intrinsics::AddSaturateSigned: {
4197     Operand *Src0 = Instr->getArg(0);
4198     Operand *Src1 = Instr->getArg(1);
4199     Variable *Dest = Instr->getDest();
4200     auto *T = makeReg(Dest->getType());
4201     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4202     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4203     _movp(T, Src0RM);
4204     _padds(T, Src1RM);
4205     _movp(Dest, T);
4206     return;
4207   }
4208   case Intrinsics::SubtractSaturateSigned: {
4209     Operand *Src0 = Instr->getArg(0);
4210     Operand *Src1 = Instr->getArg(1);
4211     Variable *Dest = Instr->getDest();
4212     auto *T = makeReg(Dest->getType());
4213     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4214     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4215     _movp(T, Src0RM);
4216     _psubs(T, Src1RM);
4217     _movp(Dest, T);
4218     return;
4219   }
4220   case Intrinsics::AddSaturateUnsigned: {
4221     Operand *Src0 = Instr->getArg(0);
4222     Operand *Src1 = Instr->getArg(1);
4223     Variable *Dest = Instr->getDest();
4224     auto *T = makeReg(Dest->getType());
4225     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4226     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4227     _movp(T, Src0RM);
4228     _paddus(T, Src1RM);
4229     _movp(Dest, T);
4230     return;
4231   }
4232   case Intrinsics::SubtractSaturateUnsigned: {
4233     Operand *Src0 = Instr->getArg(0);
4234     Operand *Src1 = Instr->getArg(1);
4235     Variable *Dest = Instr->getDest();
4236     auto *T = makeReg(Dest->getType());
4237     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4238     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4239     _movp(T, Src0RM);
4240     _psubus(T, Src1RM);
4241     _movp(Dest, T);
4242     return;
4243   }
4244   case Intrinsics::Nearbyint: {
4245     Operand *Src = Instr->getArg(0);
4246     Variable *Dest = Instr->getDest();
4247     Type DestTy = Dest->getType();
4248     if (isVectorType(DestTy)) {
4249       assert(DestTy == IceType_v4i32);
4250       assert(Src->getType() == IceType_v4f32);
4251       Operand *Src0R = legalizeToReg(Src);
4252       Variable *T = makeReg(DestTy);
4253       _cvt(T, Src0R, Insts::Cvt::Ps2dq);
4254       _movp(Dest, T);
4255     } else if (DestTy == IceType_i64) {
4256       llvm::report_fatal_error("Helper call was expected");
4257     } else {
4258       Operand *Src0RM = legalize(Src, Legal_Reg | Legal_Mem);
4259       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
4260       assert(DestTy != IceType_i64);
4261       Variable *T_1 = makeReg(IceType_i32);
4262       // cvt() requires its integer argument to be a GPR.
4263       Variable *T_2 = makeReg(DestTy);
4264       if (isByteSizedType(DestTy)) {
4265         assert(T_1->getType() == IceType_i32);
4266         T_1->setRegClass(RCX86_Is32To8);
4267         T_2->setRegClass(RCX86_IsTrunc8Rcvr);
4268       }
4269       _cvt(T_1, Src0RM, Insts::Cvt::Ss2si);
4270       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
4271       if (DestTy == IceType_i1)
4272         _and(T_2, Ctx->getConstantInt1(1));
4273       _mov(Dest, T_2);
4274     }
4275     return;
4276   }
4277   case Intrinsics::Round: {
4278     assert(InstructionSet >= SSE4_1);
4279     Variable *Dest = Instr->getDest();
4280     Operand *Src = Instr->getArg(0);
4281     Operand *Mode = Instr->getArg(1);
4282     assert(llvm::isa<ConstantInteger32>(Mode) &&
4283            "Round last argument must be a constant");
4284     auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
4285     int32_t Imm = llvm::cast<ConstantInteger32>(Mode)->getValue();
4286     (void)Imm;
4287     assert(Imm >= 0 && Imm < 4 && "Invalid rounding mode");
4288     auto *T = makeReg(Dest->getType());
4289     _round(T, SrcRM, Mode);
4290     _movp(Dest, T);
4291     return;
4292   }
4293   default: // UnknownIntrinsic
4294     Func->setError("Unexpected intrinsic");
4295     return;
4296   }
4297   return;
4298 }
4299 
4300 void TargetX8632::lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr,
4301                                      Operand *Expected, Operand *Desired) {
4302   Type Ty = Expected->getType();
4303   if (Ty == IceType_i64) {
4304     // Reserve the pre-colored registers first, before adding any more
4305     // infinite-weight variables from formMemoryOperand's legalization.
4306     Variable *T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
4307     Variable *T_eax = makeReg(IceType_i32, RegX8632::Reg_eax);
4308     Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
4309     Variable *T_ebx = makeReg(IceType_i32, RegX8632::Reg_ebx);
4310     _mov(T_eax, loOperand(Expected));
4311     _mov(T_edx, hiOperand(Expected));
4312     _mov(T_ebx, loOperand(Desired));
4313     _mov(T_ecx, hiOperand(Desired));
4314     X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4315     constexpr bool Locked = true;
4316     _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
4317     auto *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
4318     auto *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
4319     _mov(DestLo, T_eax);
4320     _mov(DestHi, T_edx);
4321     return;
4322   }
4323   RegNumT Eax;
4324   switch (Ty) {
4325   default:
4326     llvm::report_fatal_error("Bad type for cmpxchg");
4327   case IceType_i32:
4328     Eax = RegX8632::Reg_eax;
4329     break;
4330   case IceType_i16:
4331     Eax = RegX8632::Reg_ax;
4332     break;
4333   case IceType_i8:
4334     Eax = RegX8632::Reg_al;
4335     break;
4336   }
4337   Variable *T_eax = makeReg(Ty, Eax);
4338   _mov(T_eax, Expected);
4339   X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4340   Variable *DesiredReg = legalizeToReg(Desired);
4341   constexpr bool Locked = true;
4342   _cmpxchg(Addr, T_eax, DesiredReg, Locked);
4343   _mov(DestPrev, T_eax);
4344 }
4345 
4346 bool TargetX8632::tryOptimizedCmpxchgCmpBr(Variable *Dest, Operand *PtrToMem,
4347                                            Operand *Expected,
4348                                            Operand *Desired) {
4349   if (Func->getOptLevel() == Opt_m1)
4350     return false;
4351   // Peek ahead a few instructions and see how Dest is used.
4352   // It's very common to have:
4353   //
4354   // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
4355   // [%y_phi = ...] // list of phi stores
4356   // %p = icmp eq i32 %x, %expected
4357   // br i1 %p, label %l1, label %l2
4358   //
4359   // which we can optimize into:
4360   //
4361   // %x = <cmpxchg code>
4362   // [%y_phi = ...] // list of phi stores
4363   // br eq, %l1, %l2
4364   InstList::iterator I = Context.getCur();
4365   // I is currently the InstIntrinsic. Peek past that.
4366   // This assumes that the atomic cmpxchg has not been lowered yet,
4367   // so that the instructions seen in the scan from "Cur" is simple.
4368   assert(llvm::isa<InstIntrinsic>(*I));
4369   Inst *NextInst = Context.getNextInst(I);
4370   if (!NextInst)
4371     return false;
4372   // There might be phi assignments right before the compare+branch, since
4373   // this could be a backward branch for a loop. This placement of assignments
4374   // is determined by placePhiStores().
4375   CfgVector<InstAssign *> PhiAssigns;
4376   while (auto *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
4377     if (PhiAssign->getDest() == Dest)
4378       return false;
4379     PhiAssigns.push_back(PhiAssign);
4380     NextInst = Context.getNextInst(I);
4381     if (!NextInst)
4382       return false;
4383   }
4384   if (auto *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
4385     if (!(NextCmp->getCondition() == InstIcmp::Eq &&
4386           ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
4387            (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
4388       return false;
4389     }
4390     NextInst = Context.getNextInst(I);
4391     if (!NextInst)
4392       return false;
4393     if (auto *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
4394       if (!NextBr->isUnconditional() &&
4395           NextCmp->getDest() == NextBr->getCondition() &&
4396           NextBr->isLastUse(NextCmp->getDest())) {
4397         lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
4398         for (size_t i = 0; i < PhiAssigns.size(); ++i) {
4399           // Lower the phi assignments now, before the branch (same placement
4400           // as before).
4401           InstAssign *PhiAssign = PhiAssigns[i];
4402           PhiAssign->setDeleted();
4403           lowerAssign(PhiAssign);
4404           Context.advanceNext();
4405         }
4406         _br(CondX86::Br_e, NextBr->getTargetTrue(), NextBr->getTargetFalse());
4407         // Skip over the old compare and branch, by deleting them.
4408         NextCmp->setDeleted();
4409         NextBr->setDeleted();
4410         Context.advanceNext();
4411         Context.advanceNext();
4412         return true;
4413       }
4414     }
4415   }
4416   return false;
4417 }
4418 
4419 void TargetX8632::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
4420                                  Operand *Ptr, Operand *Val) {
4421   bool NeedsCmpxchg = false;
4422   LowerBinOp Op_Lo = nullptr;
4423   LowerBinOp Op_Hi = nullptr;
4424   switch (Operation) {
4425   default:
4426     Func->setError("Unknown AtomicRMW operation");
4427     return;
4428   case Intrinsics::AtomicAdd: {
4429     if (Dest->getType() == IceType_i64) {
4430       // All the fall-through paths must set this to true; the flag is
4431       // only used for asserting.
4432       NeedsCmpxchg = true;
4433       Op_Lo = &TargetX8632::_add;
4434       Op_Hi = &TargetX8632::_adc;
4435       break;
4436     }
4437     X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
4438     constexpr bool Locked = true;
4439     Variable *T = nullptr;
4440     _mov(T, Val);
4441     _xadd(Addr, T, Locked);
4442     _mov(Dest, T);
4443     return;
4444   }
4445   case Intrinsics::AtomicSub: {
4446     if (Dest->getType() == IceType_i64) {
4447       NeedsCmpxchg = true;
4448       Op_Lo = &TargetX8632::_sub;
4449       Op_Hi = &TargetX8632::_sbb;
4450       break;
4451     }
4452     X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
4453     constexpr bool Locked = true;
4454     Variable *T = nullptr;
4455     _mov(T, Val);
4456     _neg(T);
4457     _xadd(Addr, T, Locked);
4458     _mov(Dest, T);
4459     return;
4460   }
4461   case Intrinsics::AtomicOr:
4462     // TODO(jvoung): If Dest is null or dead, then some of these
4463     // operations do not need an "exchange", but just a locked op.
4464     // That appears to be "worth" it for sub, or, and, and xor.
4465     // xadd is probably fine vs lock add for add, and xchg is fine
4466     // vs an atomic store.
4467     NeedsCmpxchg = true;
4468     Op_Lo = &TargetX8632::_or;
4469     Op_Hi = &TargetX8632::_or;
4470     break;
4471   case Intrinsics::AtomicAnd:
4472     NeedsCmpxchg = true;
4473     Op_Lo = &TargetX8632::_and;
4474     Op_Hi = &TargetX8632::_and;
4475     break;
4476   case Intrinsics::AtomicXor:
4477     NeedsCmpxchg = true;
4478     Op_Lo = &TargetX8632::_xor;
4479     Op_Hi = &TargetX8632::_xor;
4480     break;
4481   case Intrinsics::AtomicExchange:
4482     if (Dest->getType() == IceType_i64) {
4483       NeedsCmpxchg = true;
4484       // NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values
4485       // just need to be moved to the ecx and ebx registers.
4486       Op_Lo = nullptr;
4487       Op_Hi = nullptr;
4488       break;
4489     }
4490     X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
4491     Variable *T = nullptr;
4492     _mov(T, Val);
4493     _xchg(Addr, T);
4494     _mov(Dest, T);
4495     return;
4496   }
4497   // Otherwise, we need a cmpxchg loop.
4498   (void)NeedsCmpxchg;
4499   assert(NeedsCmpxchg);
4500   expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
4501 }
4502 
4503 void TargetX8632::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo, LowerBinOp Op_Hi,
4504                                            Variable *Dest, Operand *Ptr,
4505                                            Operand *Val) {
4506   // Expand a more complex RMW operation as a cmpxchg loop:
4507   // For 64-bit:
4508   //   mov     eax, [ptr]
4509   //   mov     edx, [ptr + 4]
4510   // .LABEL:
4511   //   mov     ebx, eax
4512   //   <Op_Lo> ebx, <desired_adj_lo>
4513   //   mov     ecx, edx
4514   //   <Op_Hi> ecx, <desired_adj_hi>
4515   //   lock cmpxchg8b [ptr]
4516   //   jne     .LABEL
4517   //   mov     <dest_lo>, eax
4518   //   mov     <dest_hi>, edx
4519   //
4520   // For 32-bit:
4521   //   mov     eax, [ptr]
4522   // .LABEL:
4523   //   mov     <reg>, eax
4524   //   op      <reg>, [desired_adj]
4525   //   lock cmpxchg [ptr], <reg>
4526   //   jne     .LABEL
4527   //   mov     <dest>, eax
4528   //
4529   // If Op_{Lo,Hi} are nullptr, then just copy the value.
4530   Val = legalize(Val);
4531   Type Ty = Val->getType();
4532   if (Ty == IceType_i64) {
4533     Variable *T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
4534     Variable *T_eax = makeReg(IceType_i32, RegX8632::Reg_eax);
4535     X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4536     _mov(T_eax, loOperand(Addr));
4537     _mov(T_edx, hiOperand(Addr));
4538     Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
4539     Variable *T_ebx = makeReg(IceType_i32, RegX8632::Reg_ebx);
4540     InstX86Label *Label = InstX86Label::create(Func, this);
4541     const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr;
4542     if (!IsXchg8b) {
4543       Context.insert(Label);
4544       _mov(T_ebx, T_eax);
4545       (this->*Op_Lo)(T_ebx, loOperand(Val));
4546       _mov(T_ecx, T_edx);
4547       (this->*Op_Hi)(T_ecx, hiOperand(Val));
4548     } else {
4549       // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
4550       // It just needs the Val loaded into ebx and ecx.
4551       // That can also be done before the loop.
4552       _mov(T_ebx, loOperand(Val));
4553       _mov(T_ecx, hiOperand(Val));
4554       Context.insert(Label);
4555     }
4556     constexpr bool Locked = true;
4557     _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
4558     _br(CondX86::Br_ne, Label);
4559     if (!IsXchg8b) {
4560       // If Val is a variable, model the extended live range of Val through
4561       // the end of the loop, since it will be re-used by the loop.
4562       if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
4563         auto *ValLo = llvm::cast<Variable>(loOperand(ValVar));
4564         auto *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
4565         Context.insert<InstFakeUse>(ValLo);
4566         Context.insert<InstFakeUse>(ValHi);
4567       }
4568     } else {
4569       // For xchg, the loop is slightly smaller and ebx/ecx are used.
4570       Context.insert<InstFakeUse>(T_ebx);
4571       Context.insert<InstFakeUse>(T_ecx);
4572     }
4573     // The address base (if any) is also reused in the loop.
4574     if (Variable *Base = Addr->getBase())
4575       Context.insert<InstFakeUse>(Base);
4576     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
4577     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
4578     _mov(DestLo, T_eax);
4579     _mov(DestHi, T_edx);
4580     return;
4581   }
4582   X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4583   RegNumT Eax;
4584   switch (Ty) {
4585   default:
4586     llvm::report_fatal_error("Bad type for atomicRMW");
4587   case IceType_i32:
4588     Eax = RegX8632::Reg_eax;
4589     break;
4590   case IceType_i16:
4591     Eax = RegX8632::Reg_ax;
4592     break;
4593   case IceType_i8:
4594     Eax = RegX8632::Reg_al;
4595     break;
4596   }
4597   Variable *T_eax = makeReg(Ty, Eax);
4598   _mov(T_eax, Addr);
4599   auto *Label = Context.insert<InstX86Label>(this);
4600   // We want to pick a different register for T than Eax, so don't use
4601   // _mov(T == nullptr, T_eax).
4602   Variable *T = makeReg(Ty);
4603   _mov(T, T_eax);
4604   (this->*Op_Lo)(T, Val);
4605   constexpr bool Locked = true;
4606   _cmpxchg(Addr, T_eax, T, Locked);
4607   _br(CondX86::Br_ne, Label);
4608   // If Val is a variable, model the extended live range of Val through
4609   // the end of the loop, since it will be re-used by the loop.
4610   if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
4611     Context.insert<InstFakeUse>(ValVar);
4612   }
4613   // The address base (if any) is also reused in the loop.
4614   if (Variable *Base = Addr->getBase())
4615     Context.insert<InstFakeUse>(Base);
4616   _mov(Dest, T_eax);
4617 }
4618 
4619 /// Lowers count {trailing, leading} zeros intrinsic.
4620 ///
4621 /// We could do constant folding here, but that should have
4622 /// been done by the front-end/middle-end optimizations.
4623 
4624 void TargetX8632::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
4625                                   Operand *FirstVal, Operand *SecondVal) {
4626   // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
4627   // Then the instructions will handle the Val == 0 case much more simply
4628   // and won't require conversion from bit position to number of zeros.
4629   //
4630   // Otherwise:
4631   //   bsr IF_NOT_ZERO, Val
4632   //   mov T_DEST, ((Ty == i32) ? 63 : 127)
4633   //   cmovne T_DEST, IF_NOT_ZERO
4634   //   xor T_DEST, ((Ty == i32) ? 31 : 63)
4635   //   mov DEST, T_DEST
4636   //
4637   // NOTE: T_DEST must be a register because cmov requires its dest to be a
4638   // register. Also, bsf and bsr require their dest to be a register.
4639   //
4640   // The xor DEST, C(31|63) converts a bit position to # of leading zeroes.
4641   // E.g., for 000... 00001100, bsr will say that the most significant bit
4642   // set is at position 3, while the number of leading zeros is 28. Xor is
4643   // like (M - N) for N <= M, and converts 63 to 32, and 127 to 64 (for the
4644   // all-zeros case).
4645   //
4646   // X8632 only: Similar for 64-bit, but start w/ speculating that the upper
4647   // 32 bits are all zero, and compute the result for that case (checking the
4648   // lower 32 bits). Then actually compute the result for the upper bits and
4649   // cmov in the result from the lower computation if the earlier speculation
4650   // was correct.
4651   //
4652   // Cttz is similar, but uses bsf instead, doesn't require the xor bit
4653   // position conversion, and the speculation is reversed.
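  //
  // For example, a 32-bit cttz of 0x00001000: bsf writes 12 (the index of
  // the lowest set bit) into the temporary, and the cmovne replaces the
  // preloaded 32 with 12; only a zero input leaves the 32 in place.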
4654 
4655   // TODO(jpp): refactor this method.
4656   assert(Ty == IceType_i32 || Ty == IceType_i64);
4657   const Type DestTy = IceType_i32;
4658   Variable *T = makeReg(DestTy);
4659   Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
4660   if (Cttz) {
4661     _bsf(T, FirstValRM);
4662   } else {
4663     _bsr(T, FirstValRM);
4664   }
4665   Variable *T_Dest = makeReg(DestTy);
4666   Constant *_31 = Ctx->getConstantInt32(31);
4667   Constant *_32 = Ctx->getConstantInt(DestTy, 32);
4668   Constant *_63 = Ctx->getConstantInt(DestTy, 63);
4669   Constant *_64 = Ctx->getConstantInt(DestTy, 64);
4670   if (Cttz) {
4671     if (DestTy == IceType_i64) {
4672       _mov(T_Dest, _64);
4673     } else {
4674       _mov(T_Dest, _32);
4675     }
4676   } else {
4677     Constant *_127 = Ctx->getConstantInt(DestTy, 127);
4678     if (DestTy == IceType_i64) {
4679       _mov(T_Dest, _127);
4680     } else {
4681       _mov(T_Dest, _63);
4682     }
4683   }
4684   _cmov(T_Dest, T, CondX86::Br_ne);
4685   if (!Cttz) {
4686     if (DestTy == IceType_i64) {
4687       // Even though there's a _63 available at this point, that constant
4688       // might not be an i32, which will cause the xor emission to fail.
4689       Constant *_63 = Ctx->getConstantInt32(63);
4690       _xor(T_Dest, _63);
4691     } else {
4692       _xor(T_Dest, _31);
4693     }
4694   }
4695   if (Ty == IceType_i32) {
4696     _mov(Dest, T_Dest);
4697     return;
4698   }
4699   _add(T_Dest, _32);
4700   auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
4701   auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
4702   // Will be using "test" on this, so we need a registerized variable.
4703   Variable *SecondVar = legalizeToReg(SecondVal);
4704   Variable *T_Dest2 = makeReg(IceType_i32);
4705   if (Cttz) {
4706     _bsf(T_Dest2, SecondVar);
4707   } else {
4708     _bsr(T_Dest2, SecondVar);
4709     _xor(T_Dest2, _31);
4710   }
4711   _test(SecondVar, SecondVar);
4712   _cmov(T_Dest2, T_Dest, CondX86::Br_e);
4713   _mov(DestLo, T_Dest2);
4714   _mov(DestHi, Ctx->getConstantZero(IceType_i32));
4715 }
4716 
4717 void TargetX8632::typedLoad(Type Ty, Variable *Dest, Variable *Base,
4718                             Constant *Offset) {
4719   // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
4720   // legalize Mem properly.
4721   if (Offset)
4722     assert(!llvm::isa<ConstantRelocatable>(Offset));
4723 
4724   auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
4725 
4726   if (isVectorType(Ty))
4727     _movp(Dest, Mem);
4728   else if (Ty == IceType_f64)
4729     _movq(Dest, Mem);
4730   else
4731     _mov(Dest, Mem);
4732 }
4733 
4734 void TargetX8632::typedStore(Type Ty, Variable *Value, Variable *Base,
4735                              Constant *Offset) {
4736   // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
4737   // legalize Mem properly.
4738   if (Offset)
4739     assert(!llvm::isa<ConstantRelocatable>(Offset));
4740 
4741   auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
4742 
4743   if (isVectorType(Ty))
4744     _storep(Value, Mem);
4745   else if (Ty == IceType_f64)
4746     _storeq(Value, Mem);
4747   else
4748     _store(Value, Mem);
4749 }
4750 
4751 void TargetX8632::copyMemory(Type Ty, Variable *Dest, Variable *Src,
4752                              int32_t OffsetAmt) {
4753   Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
4754   // TODO(ascull): this or add nullptr test to _movp, _movq
4755   Variable *Data = makeReg(Ty);
4756 
4757   typedLoad(Ty, Data, Src, Offset);
4758   typedStore(Ty, Data, Dest, Offset);
4759 }
4760 
4761 void TargetX8632::lowerMemcpy(Operand *Dest, Operand *Src, Operand *Count) {
4762   // There is a load and store for each chunk in the unroll
4763   constexpr uint32_t BytesPerStorep = 16;
4764 
4765   // Check if the operands are constants
4766   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
4767   const bool IsCountConst = CountConst != nullptr;
4768   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
4769 
4770   if (shouldOptimizeMemIntrins() && IsCountConst &&
4771       CountValue <= BytesPerStorep * MEMCPY_UNROLL_LIMIT) {
4772     // Unlikely, but nothing to do if it does happen
4773     if (CountValue == 0)
4774       return;
4775 
4776     Variable *SrcBase = legalizeToReg(Src);
4777     Variable *DestBase = legalizeToReg(Dest);
4778 
4779     // Find the largest type that can be used and use it as much as possible
4780     // in reverse order. Then handle any remainder with overlapping copies.
4781     // Since the remainder will be at the end, there will be reduced pressure
4782     // on the memory unit as the accesses to the same memory are far apart.
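    // Illustrative example (assuming the usual 16-byte vector and 8-byte
    // scalar chunk types): a constant 21-byte memcpy becomes one 16-byte
    // copy at offset 0 plus one 8-byte copy at offset 13, harmlessly
    // re-copying bytes 13..15 instead of issuing several small tail copies.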
4783     Type Ty = largestTypeInSize(CountValue);
4784     uint32_t TyWidth = typeWidthInBytes(Ty);
4785 
4786     uint32_t RemainingBytes = CountValue;
4787     int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
4788     while (RemainingBytes >= TyWidth) {
4789       copyMemory(Ty, DestBase, SrcBase, Offset);
4790       RemainingBytes -= TyWidth;
4791       Offset -= TyWidth;
4792     }
4793 
4794     if (RemainingBytes == 0)
4795       return;
4796 
4797     // Lower the remaining bytes. Adjust to larger types in order to make use
4798     // of overlaps in the copies.
4799     Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
4800     Offset = CountValue - typeWidthInBytes(LeftOverTy);
4801     copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
4802     return;
4803   }
4804 
4805   // Fall back on a function call
4806   InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memcpy, nullptr, 3);
4807   Call->addArg(Dest);
4808   Call->addArg(Src);
4809   Call->addArg(Count);
4810   lowerCall(Call);
4811 }
4812 
4813 void TargetX8632::lowerMemmove(Operand *Dest, Operand *Src, Operand *Count) {
4814   // There is a load and store for each chunk in the unroll
4815   constexpr uint32_t BytesPerStorep = 16;
4816 
4817   // Check if the operands are constants
4818   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
4819   const bool IsCountConst = CountConst != nullptr;
4820   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
4821 
4822   if (shouldOptimizeMemIntrins() && IsCountConst &&
4823       CountValue <= BytesPerStorep * MEMMOVE_UNROLL_LIMIT) {
4824     // Unlikely, but nothing to do if it does happen
4825     if (CountValue == 0)
4826       return;
4827 
4828     Variable *SrcBase = legalizeToReg(Src);
4829     Variable *DestBase = legalizeToReg(Dest);
4830 
4831     std::tuple<Type, Constant *, Variable *> Moves[MEMMOVE_UNROLL_LIMIT];
4832     Constant *Offset;
4833     Variable *Reg;
4834 
4835     // Copy the data into registers as the source and destination could
4836     // overlap so make sure not to clobber the memory. This also means
4837     // overlapping moves can be used as we are taking a safe snapshot of the
4838     // memory.
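    // For instance (with the usual 16- and 8-byte chunk types), a constant
    // 21-byte memmove loads 16 bytes at offset 0 and 8 bytes at offset 13
    // into registers before emitting any store, so overlapping source and
    // destination ranges cannot be corrupted mid-copy.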
4839     Type Ty = largestTypeInSize(CountValue);
4840     uint32_t TyWidth = typeWidthInBytes(Ty);
4841 
4842     uint32_t RemainingBytes = CountValue;
4843     int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
4844     size_t N = 0;
4845     while (RemainingBytes >= TyWidth) {
4846       assert(N <= MEMMOVE_UNROLL_LIMIT);
4847       Offset = Ctx->getConstantInt32(OffsetAmt);
4848       Reg = makeReg(Ty);
4849       typedLoad(Ty, Reg, SrcBase, Offset);
4850       RemainingBytes -= TyWidth;
4851       OffsetAmt -= TyWidth;
4852       Moves[N++] = std::make_tuple(Ty, Offset, Reg);
4853     }
4854 
4855     if (RemainingBytes != 0) {
4856       // Lower the remaining bytes. Adjust to larger types in order to make
4857       // use of overlaps in the copies.
4858       assert(N <= MEMMOVE_UNROLL_LIMIT);
4859       Ty = firstTypeThatFitsSize(RemainingBytes);
4860       Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
4861       Reg = makeReg(Ty);
4862       typedLoad(Ty, Reg, SrcBase, Offset);
4863       Moves[N++] = std::make_tuple(Ty, Offset, Reg);
4864     }
4865 
4866     // Copy the data out into the destination memory
4867     for (size_t i = 0; i < N; ++i) {
4868       std::tie(Ty, Offset, Reg) = Moves[i];
4869       typedStore(Ty, Reg, DestBase, Offset);
4870     }
4871 
4872     return;
4873   }
4874 
4875   // Fall back on a function call
4876   InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memmove, nullptr, 3);
4877   Call->addArg(Dest);
4878   Call->addArg(Src);
4879   Call->addArg(Count);
4880   lowerCall(Call);
4881 }
4882 
4883 void TargetX8632::lowerMemset(Operand *Dest, Operand *Val, Operand *Count) {
4884   constexpr uint32_t BytesPerStorep = 16;
4885   constexpr uint32_t BytesPerStoreq = 8;
4886   constexpr uint32_t BytesPerStorei32 = 4;
4887   assert(Val->getType() == IceType_i8);
4888 
4889   // Check if the operands are constants
4890   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
4891   const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
4892   const bool IsCountConst = CountConst != nullptr;
4893   const bool IsValConst = ValConst != nullptr;
4894   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
4895   const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
4896 
4897   // Unlikely, but nothing to do if it does happen
4898   if (IsCountConst && CountValue == 0)
4899     return;
4900 
4901   // TODO(ascull): if the count is constant but val is not it would be
4902   // possible to inline by spreading the value across 4 bytes and accessing
4903   // subregs e.g. eax, ax and al.
4904   if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
4905     Variable *Base = nullptr;
4906     Variable *VecReg = nullptr;
4907     const uint32_t MaskValue = (ValValue & 0xff);
4908     const uint32_t SpreadValue =
4909         (MaskValue << 24) | (MaskValue << 16) | (MaskValue << 8) | MaskValue;
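    // e.g. a Val of 0xAB yields SpreadValue 0xABABABAB; since every byte of
    // the pattern is identical, 32-, 16- and 8-bit stores all write the
    // repeated byte value.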
4910 
4911     auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
4912                                                         uint32_t OffsetAmt) {
4913       assert(Base != nullptr);
4914       Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
4915 
4916       // TODO(ascull): is 64-bit better with vector or scalar movq?
4917       auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
4918       if (isVectorType(Ty)) {
4919         assert(VecReg != nullptr);
4920         _storep(VecReg, Mem);
4921       } else if (Ty == IceType_f64) {
4922         assert(VecReg != nullptr);
4923         _storeq(VecReg, Mem);
4924       } else {
4925         assert(Ty != IceType_i64);
4926         _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
4927       }
4928     };
4929 
4930     // Find the largest type that can be used and use it as much as possible
4931     // in reverse order. Then handle any remainder with overlapping copies.
4932     // Since the remainder will be at the end, there will be reduced pressure
4933     // on the memory unit as the accesses to the same memory are far apart.
4934     Type Ty = IceType_void;
4935     if (ValValue == 0 && CountValue >= BytesPerStoreq &&
4936         CountValue <= BytesPerStorep * MEMSET_UNROLL_LIMIT) {
4937       // When the value is zero it can be loaded into a vector register
4938       // cheaply using the xor trick.
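      // (makeVectorOfZeros is expected to emit a pxor of the register with
      // itself, the standard zeroing idiom.)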
4939       Base = legalizeToReg(Dest);
4940       VecReg = makeVectorOfZeros(IceType_v16i8);
4941       Ty = largestTypeInSize(CountValue);
4942     } else if (CountValue <= BytesPerStorei32 * MEMSET_UNROLL_LIMIT) {
4943       // When the value is non-zero or the count is small we can't use vector
4944       // instructions so are limited to 32-bit stores.
4945       Base = legalizeToReg(Dest);
4946       constexpr uint32_t MaxSize = 4;
4947       Ty = largestTypeInSize(CountValue, MaxSize);
4948     }
4949 
4950     if (Base) {
4951       uint32_t TyWidth = typeWidthInBytes(Ty);
4952 
4953       uint32_t RemainingBytes = CountValue;
4954       uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
4955       while (RemainingBytes >= TyWidth) {
4956         lowerSet(Ty, Offset);
4957         RemainingBytes -= TyWidth;
4958         Offset -= TyWidth;
4959       }
4960 
4961       if (RemainingBytes == 0)
4962         return;
4963 
4964       // Lower the remaining bytes. Adjust to larger types in order to make
4965       // use of overlaps in the copies.
4966       Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
4967       Offset = CountValue - typeWidthInBytes(LeftOverTy);
4968       lowerSet(LeftOverTy, Offset);
4969       return;
4970     }
4971   }
4972 
4973   // Fall back on calling the memset function. The value operand needs to be
4974   // extended to a stack slot size because the PNaCl ABI requires arguments to
4975   // be at least 32 bits wide.
4976   Operand *ValExt;
4977   if (IsValConst) {
4978     ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
4979   } else {
4980     Variable *ValExtVar = Func->makeVariable(stackSlotType());
4981     lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val));
4982     ValExt = ValExtVar;
4983   }
4984   InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memset, nullptr, 3);
4985   Call->addArg(Dest);
4986   Call->addArg(ValExt);
4987   Call->addArg(Count);
4988   lowerCall(Call);
4989 }
4990 
4991 class AddressOptimizer {
4992   AddressOptimizer() = delete;
4993   AddressOptimizer(const AddressOptimizer &) = delete;
4994   AddressOptimizer &operator=(const AddressOptimizer &) = delete;
4995 
4996 public:
4997   explicit AddressOptimizer(const Cfg *Func)
4998       : Func(Func), VMetadata(Func->getVMetadata()) {}
4999 
5000   inline void dumpAddressOpt(const ConstantRelocatable *const Relocatable,
5001                              int32_t Offset, const Variable *Base,
5002                              const Variable *Index, uint16_t Shift,
5003                              const Inst *Reason) const;
5004 
5005   inline const Inst *matchAssign(Variable **Var,
5006                                  ConstantRelocatable **Relocatable,
5007                                  int32_t *Offset);
5008 
5009   inline const Inst *matchCombinedBaseIndex(Variable **Base, Variable **Index,
5010                                             uint16_t *Shift);
5011 
5012   inline const Inst *matchShiftedIndex(Variable **Index, uint16_t *Shift);
5013 
5014   inline const Inst *matchOffsetIndexOrBase(Variable **IndexOrBase,
5015                                             const uint16_t Shift,
5016                                             ConstantRelocatable **Relocatable,
5017                                             int32_t *Offset);
5018 
5019 private:
5020   const Cfg *const Func;
5021   const VariablesMetadata *const VMetadata;
5022 
5023   static bool isAdd(const Inst *Instr) {
5024     if (auto *Arith = llvm::dyn_cast_or_null<const InstArithmetic>(Instr)) {
5025       return (Arith->getOp() == InstArithmetic::Add);
5026     }
5027     return false;
5028   }
5029 };
5030 
5031 void AddressOptimizer::dumpAddressOpt(
5032     const ConstantRelocatable *const Relocatable, int32_t Offset,
5033     const Variable *Base, const Variable *Index, uint16_t Shift,
5034     const Inst *Reason) const {
5035   if (!BuildDefs::dump())
5036     return;
5037   if (!Func->isVerbose(IceV_AddrOpt))
5038     return;
5039   OstreamLocker L(Func->getContext());
5040   Ostream &Str = Func->getContext()->getStrDump();
5041   Str << "Instruction: ";
5042   Reason->dumpDecorated(Func);
5043   Str << "  results in Base=";
5044   if (Base)
5045     Base->dump(Func);
5046   else
5047     Str << "<null>";
5048   Str << ", Index=";
5049   if (Index)
5050     Index->dump(Func);
5051   else
5052     Str << "<null>";
5053   Str << ", Shift=" << Shift << ", Offset=" << Offset
5054       << ", Relocatable=" << Relocatable << "\n";
5055 }
5056 
5057 const Inst *AddressOptimizer::matchAssign(Variable **Var,
5058                                           ConstantRelocatable **Relocatable,
5059                                           int32_t *Offset) {
5060   // Var originates from Var=SrcVar ==> set Var:=SrcVar
5061   if (*Var == nullptr)
5062     return nullptr;
5063   if (const Inst *VarAssign = VMetadata->getSingleDefinition(*Var)) {
5064     assert(!VMetadata->isMultiDef(*Var));
5065     if (llvm::isa<InstAssign>(VarAssign)) {
5066       Operand *SrcOp = VarAssign->getSrc(0);
5067       assert(SrcOp);
5068       if (auto *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
5069         if (!VMetadata->isMultiDef(SrcVar) &&
5070             // TODO: ensure SrcVar stays single-BB
5071             true) {
5072           *Var = SrcVar;
5073           return VarAssign;
5074         }
5075       } else if (auto *Const = llvm::dyn_cast<ConstantInteger32>(SrcOp)) {
5076         int32_t MoreOffset = Const->getValue();
5077         if (Utils::WouldOverflowAdd(*Offset, MoreOffset))
5078           return nullptr;
5079         *Var = nullptr;
5080         *Offset += MoreOffset;
5081         return VarAssign;
5082       } else if (auto *AddReloc = llvm::dyn_cast<ConstantRelocatable>(SrcOp)) {
5083         if (*Relocatable == nullptr) {
5084           // It is always safe to fold a relocatable through assignment -- the
5085           // assignment frees a slot in the address operand that can be used
5086           // to hold the Sandbox Pointer -- if any.
5087           *Var = nullptr;
5088           *Relocatable = AddReloc;
5089           return VarAssign;
5090         }
5091       }
5092     }
5093   }
5094   return nullptr;
5095 }
5096 
5097 const Inst *AddressOptimizer::matchCombinedBaseIndex(Variable **Base,
5098                                                      Variable **Index,
5099                                                      uint16_t *Shift) {
5100   // Index==nullptr && Base is Base=Var1+Var2 ==>
5101   //   set Base=Var1, Index=Var2, Shift=0
5102   if (*Base == nullptr)
5103     return nullptr;
5104   if (*Index != nullptr)
5105     return nullptr;
5106   auto *BaseInst = VMetadata->getSingleDefinition(*Base);
5107   if (BaseInst == nullptr)
5108     return nullptr;
5109   assert(!VMetadata->isMultiDef(*Base));
5110   if (BaseInst->getSrcSize() < 2)
5111     return nullptr;
5112   if (auto *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
5113     if (VMetadata->isMultiDef(Var1))
5114       return nullptr;
5115     if (auto *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
5116       if (VMetadata->isMultiDef(Var2))
5117         return nullptr;
5118       if (isAdd(BaseInst) &&
5119           // TODO: ensure Var1 and Var2 stay single-BB
5120           true) {
5121         *Base = Var1;
5122         *Index = Var2;
5123         *Shift = 0; // should already have been 0
5124         return BaseInst;
5125       }
5126     }
5127   }
5128   return nullptr;
5129 }
5130 
5131 const Inst *AddressOptimizer::matchShiftedIndex(Variable **Index,
5132                                                 uint16_t *Shift) {
5133   // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
5134   //   Index=Var, Shift+=log2(Const)
5135   if (*Index == nullptr)
5136     return nullptr;
5137   auto *IndexInst = VMetadata->getSingleDefinition(*Index);
5138   if (IndexInst == nullptr)
5139     return nullptr;
5140   assert(!VMetadata->isMultiDef(*Index));
5141 
5142   // When using an unsigned 32-bit array index on x64, it gets zero-extended
5143   // before the shift & add. The explicit zero extension can be eliminated
5144   // because x86 32-bit operations automatically get zero-extended into the
5145   // corresponding 64-bit register.
5146   if (auto *CastInst = llvm::dyn_cast<InstCast>(IndexInst)) {
5147     if (CastInst->getCastKind() == InstCast::Zext) {
5148       if (auto *Var = llvm::dyn_cast<Variable>(CastInst->getSrc(0))) {
5149         if (Var->getType() == IceType_i32 &&
5150             CastInst->getDest()->getType() == IceType_i64) {
5151           IndexInst = VMetadata->getSingleDefinition(Var);
5152         }
5153       }
5154     }
5155   }
5156 
5157   if (IndexInst->getSrcSize() < 2)
5158     return nullptr;
5159   if (auto *ArithInst = llvm::dyn_cast<InstArithmetic>(IndexInst)) {
5160     if (auto *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
5161       if (auto *Const =
5162               llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
5163         if (VMetadata->isMultiDef(Var) || Const->getType() != IceType_i32)
5164           return nullptr;
5165         switch (ArithInst->getOp()) {
5166         default:
5167           return nullptr;
5168         case InstArithmetic::Mul: {
5169           uint32_t Mult = Const->getValue();
5170           uint32_t LogMult;
5171           switch (Mult) {
5172           case 1:
5173             LogMult = 0;
5174             break;
5175           case 2:
5176             LogMult = 1;
5177             break;
5178           case 4:
5179             LogMult = 2;
5180             break;
5181           case 8:
5182             LogMult = 3;
5183             break;
5184           default:
5185             return nullptr;
5186           }
5187           if (*Shift + LogMult <= 3) {
5188             *Index = Var;
5189             *Shift += LogMult;
5190             return IndexInst;
5191           }
5192         }
5193         case InstArithmetic::Shl: {
5194           uint32_t ShiftAmount = Const->getValue();
5195           switch (ShiftAmount) {
5196           case 0:
5197           case 1:
5198           case 2:
5199           case 3:
5200             break;
5201           default:
5202             return nullptr;
5203           }
5204           if (*Shift + ShiftAmount <= 3) {
5205             *Index = Var;
5206             *Shift += ShiftAmount;
5207             return IndexInst;
5208           }
5209         }
5210         }
5211       }
5212     }
5213   }
5214   return nullptr;
5215 }
5216 
5217 const Inst *AddressOptimizer::matchOffsetIndexOrBase(
5218     Variable **IndexOrBase, const uint16_t Shift,
5219     ConstantRelocatable **Relocatable, int32_t *Offset) {
5220   // Base is Base=Var+Const || Base is Base=Const+Var ==>
5221   //   set Base=Var, Offset+=Const
5222   // Base is Base=Var-Const ==>
5223   //   set Base=Var, Offset-=Const
5224   // Index is Index=Var+Const ==>
5225   //   set Index=Var, Offset+=(Const<<Shift)
5226   // Index is Index=Const+Var ==>
5227   //   set Index=Var, Offset+=(Const<<Shift)
5228   // Index is Index=Var-Const ==>
5229   //   set Index=Var, Offset-=(Const<<Shift)
5230   // Treat Index=Var Or Const as Index=Var + Const
5231   //    when Var = Var' << N and log2(Const) <= N
5232   // or when Var = (2^M) * (2^N) and log2(Const) <= (M+N)
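  //
  // For example, if Var comes from "Var = X << 2", then "Index = Var or 3"
  // behaves exactly like "Index = Var + 3", because the or cannot carry
  // into the bits produced by the shift.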
5233 
5234   if (*IndexOrBase == nullptr) {
5235     return nullptr;
5236   }
5237   const Inst *Definition = VMetadata->getSingleDefinition(*IndexOrBase);
5238   if (Definition == nullptr) {
5239     return nullptr;
5240   }
5241   assert(!VMetadata->isMultiDef(*IndexOrBase));
5242   if (auto *ArithInst = llvm::dyn_cast<const InstArithmetic>(Definition)) {
5243     switch (ArithInst->getOp()) {
5244     case InstArithmetic::Add:
5245     case InstArithmetic::Sub:
5246     case InstArithmetic::Or:
5247       break;
5248     default:
5249       return nullptr;
5250     }
5251 
5252     Operand *Src0 = ArithInst->getSrc(0);
5253     Operand *Src1 = ArithInst->getSrc(1);
5254     auto *Var0 = llvm::dyn_cast<Variable>(Src0);
5255     auto *Var1 = llvm::dyn_cast<Variable>(Src1);
5256     auto *Const0 = llvm::dyn_cast<ConstantInteger32>(Src0);
5257     auto *Const1 = llvm::dyn_cast<ConstantInteger32>(Src1);
5258     auto *Reloc0 = llvm::dyn_cast<ConstantRelocatable>(Src0);
5259     auto *Reloc1 = llvm::dyn_cast<ConstantRelocatable>(Src1);
5260 
5261     bool IsAdd = false;
5262     if (ArithInst->getOp() == InstArithmetic::Or) {
5263       Variable *Var = nullptr;
5264       ConstantInteger32 *Const = nullptr;
5265       if (Var0 && Const1) {
5266         Var = Var0;
5267         Const = Const1;
5268       } else if (Const0 && Var1) {
5269         Var = Var1;
5270         Const = Const0;
5271       } else {
5272         return nullptr;
5273       }
5274       auto *VarDef =
5275           llvm::dyn_cast<InstArithmetic>(VMetadata->getSingleDefinition(Var));
5276       if (VarDef == nullptr)
5277         return nullptr;
5278 
5279       SizeT ZeroesAvailable = 0;
5280       if (VarDef->getOp() == InstArithmetic::Shl) {
5281         if (auto *ConstInt =
5282                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
5283           ZeroesAvailable = ConstInt->getValue();
5284         }
5285       } else if (VarDef->getOp() == InstArithmetic::Mul) {
5286         SizeT PowerOfTwo = 0;
5287         if (auto *MultConst =
5288                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(0))) {
5289           if (llvm::isPowerOf2_32(MultConst->getValue())) {
5290             PowerOfTwo += MultConst->getValue();
5291           }
5292         }
5293         if (auto *MultConst =
5294                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
5295           if (llvm::isPowerOf2_32(MultConst->getValue())) {
5296             PowerOfTwo += MultConst->getValue();
5297           }
5298         }
5299         ZeroesAvailable = llvm::Log2_32(PowerOfTwo) + 1;
5300       }
5301       SizeT ZeroesNeeded = llvm::Log2_32(Const->getValue()) + 1;
5302       if (ZeroesNeeded == 0 || ZeroesNeeded > ZeroesAvailable)
5303         return nullptr;
5304       IsAdd = true; // treat it as an add if the above conditions hold
5305     } else {
5306       IsAdd = ArithInst->getOp() == InstArithmetic::Add;
5307     }
5308 
5309     Variable *NewIndexOrBase = nullptr;
5310     int32_t NewOffset = 0;
5311     ConstantRelocatable *NewRelocatable = *Relocatable;
5312     if (Var0 && Var1)
5313       // TODO(sehr): merge base/index splitting into here.
5314       return nullptr;
5315     if (!IsAdd && Var1)
5316       return nullptr;
5317     if (Var0)
5318       NewIndexOrBase = Var0;
5319     else if (Var1)
5320       NewIndexOrBase = Var1;
5321     // Don't know how to add/subtract two relocatables.
5322     if ((*Relocatable && (Reloc0 || Reloc1)) || (Reloc0 && Reloc1))
5323       return nullptr;
5324     // Don't know how to subtract a relocatable.
5325     if (!IsAdd && Reloc1)
5326       return nullptr;
5327     // Incorporate ConstantRelocatables.
5328     if (Reloc0)
5329       NewRelocatable = Reloc0;
5330     else if (Reloc1)
5331       NewRelocatable = Reloc1;
5332     // Compute the updated constant offset.
5333     if (Const0) {
5334       const int32_t MoreOffset =
5335           IsAdd ? Const0->getValue() : -Const0->getValue();
5336       if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
5337         return nullptr;
5338       NewOffset += MoreOffset;
5339     }
5340     if (Const1) {
5341       const int32_t MoreOffset =
5342           IsAdd ? Const1->getValue() : -Const1->getValue();
5343       if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
5344         return nullptr;
5345       NewOffset += MoreOffset;
5346     }
5347     if (Utils::WouldOverflowAdd(*Offset, NewOffset << Shift))
5348       return nullptr;
5349     *IndexOrBase = NewIndexOrBase;
5350     *Offset += (NewOffset << Shift);
5351     // Shift is always zero if this is called with the base
5352     *Relocatable = NewRelocatable;
5353     return Definition;
5354   }
5355   return nullptr;
5356 }
5357 
5358 X86OperandMem *TargetX8632::computeAddressOpt(const Inst *Instr, Type MemType,
5359                                               Operand *Addr) {
5360   Func->resetCurrentNode();
5361   if (Func->isVerbose(IceV_AddrOpt)) {
5362     OstreamLocker L(Func->getContext());
5363     Ostream &Str = Func->getContext()->getStrDump();
5364     Str << "\nStarting computeAddressOpt for instruction:\n  ";
5365     Instr->dumpDecorated(Func);
5366   }
5367 
5368   OptAddr NewAddr;
5369   NewAddr.Base = llvm::dyn_cast<Variable>(Addr);
5370   if (NewAddr.Base == nullptr)
5371     return nullptr;
5372 
5373   // If the Base has more than one use or is live across multiple blocks, then
5374   // don't go further. Alternatively (?), never consider a transformation that
5375   // would change a variable that is currently *not* live across basic block
5376   // boundaries into one that *is*.
5377   if (!getFlags().getLoopInvariantCodeMotion()) {
5378     // Need multi block address opt when licm is enabled.
5379     // Might make sense to restrict to current node and loop header.
5380     if (Func->getVMetadata()->isMultiBlock(
5381             NewAddr.Base) /* || Base->getUseCount() > 1*/)
5382       return nullptr;
5383   }
5384   AddressOptimizer AddrOpt(Func);
5385   const bool MockBounds = getFlags().getMockBoundsCheck();
5386   const Inst *Reason = nullptr;
5387   bool AddressWasOptimized = false;
5388   // The following unnamed struct identifies the address mode formation steps
5389   // that could potentially create an invalid memory operand (i.e., no free
5390   // slots for RebasePtr.) We add all those variables to this struct so that
5391   // we can use memset() to reset all members to false.
5392   struct {
5393     bool AssignBase = false;
5394     bool AssignIndex = false;
5395     bool OffsetFromBase = false;
5396     bool OffsetFromIndex = false;
5397     bool CombinedBaseIndex = false;
5398   } Skip;
5399   // NewAddrCheckpoint is used to rollback the address being formed in case an
5400   // invalid address is formed.
5401   OptAddr NewAddrCheckpoint;
5402   Reason = Instr;
5403   do {
5404     if (Reason) {
5405       AddrOpt.dumpAddressOpt(NewAddr.Relocatable, NewAddr.Offset, NewAddr.Base,
5406                              NewAddr.Index, NewAddr.Shift, Reason);
5407       AddressWasOptimized = true;
5408       Reason = nullptr;
5409       memset(reinterpret_cast<void *>(&Skip), 0, sizeof(Skip));
5410     }
5411 
5412     NewAddrCheckpoint = NewAddr;
5413 
5414     // Update Base and Index to follow through assignments to definitions.
5415     if (!Skip.AssignBase &&
5416         (Reason = AddrOpt.matchAssign(&NewAddr.Base, &NewAddr.Relocatable,
5417                                       &NewAddr.Offset))) {
5418       // Assignments of Base from a Relocatable or ConstantInt32 can result
5419       // in Base becoming nullptr.  To avoid code duplication in this loop we
5420       // prefer that Base be non-nullptr if possible.
5421       if ((NewAddr.Base == nullptr) && (NewAddr.Index != nullptr) &&
5422           NewAddr.Shift == 0) {
5423         std::swap(NewAddr.Base, NewAddr.Index);
5424       }
5425       continue;
5426     }
5427     if (!Skip.AssignBase &&
5428         (Reason = AddrOpt.matchAssign(&NewAddr.Index, &NewAddr.Relocatable,
5429                                       &NewAddr.Offset))) {
5430       continue;
5431     }
5432 
5433     if (!MockBounds) {
5434       // Transition from:
5435       //   <Relocatable + Offset>(Base) to
5436       //   <Relocatable + Offset>(Base, Index)
5437       if (!Skip.CombinedBaseIndex &&
5438           (Reason = AddrOpt.matchCombinedBaseIndex(
5439                &NewAddr.Base, &NewAddr.Index, &NewAddr.Shift))) {
5440         continue;
5441       }
5442 
5443       // Recognize multiply/shift and update Shift amount.
5444       // Index becomes Index=Var<<Const && Const+Shift<=3 ==>
5445       //   Index=Var, Shift+=Const
5446       // Index becomes Index=Const*Var && log2(Const)+Shift<=3 ==>
5447       //   Index=Var, Shift+=log2(Const)
5448       if ((Reason =
5449                AddrOpt.matchShiftedIndex(&NewAddr.Index, &NewAddr.Shift))) {
5450         continue;
5451       }
5452 
5453       // If Shift is zero, the choice of Base and Index was purely arbitrary.
5454       // Recognize multiply/shift and set Shift amount.
5455       // Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
5456       //   swap(Index,Base)
5457       // Similar for Base=Const*Var and Base=Var<<Const
5458       if (NewAddr.Shift == 0 &&
5459           (Reason = AddrOpt.matchShiftedIndex(&NewAddr.Base, &NewAddr.Shift))) {
5460         std::swap(NewAddr.Base, NewAddr.Index);
5461         continue;
5462       }
5463     }
5464 
5465     // Update Offset to reflect additions/subtractions with constants and
5466     // relocatables.
5467     // TODO: consider overflow issues with respect to Offset.
5468     if (!Skip.OffsetFromBase && (Reason = AddrOpt.matchOffsetIndexOrBase(
5469                                      &NewAddr.Base, /*Shift =*/0,
5470                                      &NewAddr.Relocatable, &NewAddr.Offset))) {
5471       continue;
5472     }
5473     if (!Skip.OffsetFromIndex && (Reason = AddrOpt.matchOffsetIndexOrBase(
5474                                       &NewAddr.Index, NewAddr.Shift,
5475                                       &NewAddr.Relocatable, &NewAddr.Offset))) {
5476       continue;
5477     }
5478 
5479     break;
5480   } while (Reason);
5481 
5482   if (!AddressWasOptimized) {
5483     return nullptr;
5484   }
5485 
5486   // Undo any addition of RebasePtr.  It will be added back when the mem
5487   // operand is sandboxed.
5488   if (NewAddr.Base == RebasePtr) {
5489     NewAddr.Base = nullptr;
5490   }
5491 
5492   if (NewAddr.Index == RebasePtr) {
5493     NewAddr.Index = nullptr;
5494     NewAddr.Shift = 0;
5495   }
5496 
5497   Constant *OffsetOp = nullptr;
5498   if (NewAddr.Relocatable == nullptr) {
5499     OffsetOp = Ctx->getConstantInt32(NewAddr.Offset);
5500   } else {
5501     OffsetOp =
5502         Ctx->getConstantSym(NewAddr.Relocatable->getOffset() + NewAddr.Offset,
5503                             NewAddr.Relocatable->getName());
5504   }
5505   // Vanilla ICE load instructions should not use the segment registers, and
5506   // computeAddressOpt only works at the level of Variables and Constants, not
5507   // other X86OperandMem, so there should be no mention of segment
5508   // registers there either.
5509   static constexpr auto SegmentReg =
5510       X86OperandMem::SegmentRegisters::DefaultSegment;
5511 
5512   return X86OperandMem::create(Func, MemType, NewAddr.Base, OffsetOp,
5513                                NewAddr.Index, NewAddr.Shift, SegmentReg);
5514 }
5515 
5516 /// Add a mock bounds check on the memory address before using it as a load or
5517 /// store operand.  The basic idea is that given a memory operand [reg], we
5518 /// would first add bounds-check code something like:
5519 ///
5520 ///   cmp reg, <lb>
5521 ///   jl out_of_line_error
5522 ///   cmp reg, <ub>
5523 ///   jg out_of_line_error
5524 ///
5525 /// In reality, the specific code will depend on how <lb> and <ub> are
5526 /// represented, e.g. an immediate, a global, or a function argument.
5527 ///
5528 /// As such, we need to enforce that the memory operand does not have the form
5529 /// [reg1+reg2], because then there is no simple cmp instruction that would
5530 /// suffice.  However, we consider [reg+offset] to be OK because the offset is
5531 /// usually small, and so <ub> could have a safety buffer built in and then we
5532 /// could instead branch to a custom out_of_line_error that does the precise
5533 /// check and jumps back if it turns out OK.
5534 ///
5535 /// For the purpose of mocking the bounds check, we'll do something like this:
5536 ///
5537 ///   cmp reg, 0
5538 ///   je label
5539 ///   cmp reg, 1
5540 ///   je label
5541 ///   label:
5542 ///
5543 /// Also note that we don't need to add a bounds check to a dereference of a
5544 /// simple global variable address.
5545 
5546 void TargetX8632::doMockBoundsCheck(Operand *Opnd) {
5547   if (!getFlags().getMockBoundsCheck())
5548     return;
5549   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd)) {
5550     if (Mem->getIndex()) {
5551       llvm::report_fatal_error("doMockBoundsCheck: Opnd contains index reg");
5552     }
5553     Opnd = Mem->getBase();
5554   }
5555   // At this point Opnd could be nullptr, or Variable, or Constant, or perhaps
5556   // something else.  We only care if it is Variable.
5557   auto *Var = llvm::dyn_cast_or_null<Variable>(Opnd);
5558   if (Var == nullptr)
5559     return;
5560   // We use lowerStore() to copy out-args onto the stack.  This creates a
5561   // memory operand with the stack pointer as the base register.  Don't do
5562   // bounds checks on that.
5563   if (Var->getRegNum() == getStackReg())
5564     return;
5565 
5566   auto *Label = InstX86Label::create(Func, this);
5567   _cmp(Opnd, Ctx->getConstantZero(IceType_i32));
5568   _br(CondX86::Br_e, Label);
5569   _cmp(Opnd, Ctx->getConstantInt32(1));
5570   _br(CondX86::Br_e, Label);
5571   Context.insert(Label);
5572 }
5573 
5574 void TargetX8632::lowerLoad(const InstLoad *Load) {
5575   // A Load instruction can be treated the same as an Assign instruction,
5576   // after the source operand is transformed into an X86OperandMem operand.
5577   // Note that the address mode optimization already creates an X86OperandMem
5578   // operand, so it doesn't need another level of transformation.
5579   Variable *DestLoad = Load->getDest();
5580   Type Ty = DestLoad->getType();
5581   Operand *Src0 = formMemoryOperand(Load->getLoadAddress(), Ty);
5582   doMockBoundsCheck(Src0);
5583   auto *Assign = InstAssign::create(Func, DestLoad, Src0);
5584   lowerAssign(Assign);
5585 }
5586 
5587 void TargetX8632::doAddressOptOther() {
5588   // Inverts some Icmp instructions, which helps doAddressOptLoad later.
5589   // TODO(manasijm): Refactor to unify the conditions for Var0 and Var1
5590   Inst *Instr = iteratorToInst(Context.getCur());
5591   auto *VMetadata = Func->getVMetadata();
5592   if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Instr)) {
5593     if (llvm::isa<Constant>(Icmp->getSrc(0)) ||
5594         llvm::isa<Constant>(Icmp->getSrc(1)))
5595       return;
5596     auto *Var0 = llvm::dyn_cast<Variable>(Icmp->getSrc(0));
5597     if (Var0 == nullptr)
5598       return;
5599     if (!VMetadata->isTracked(Var0))
5600       return;
5601     auto *Op0Def = VMetadata->getFirstDefinitionSingleBlock(Var0);
5602     if (Op0Def == nullptr || !llvm::isa<InstLoad>(Op0Def))
5603       return;
5604     if (VMetadata->getLocalUseNode(Var0) != Context.getNode())
5605       return;
5606 
5607     auto *Var1 = llvm::dyn_cast<Variable>(Icmp->getSrc(1));
5608     if (Var1 != nullptr && VMetadata->isTracked(Var1)) {
5609       auto *Op1Def = VMetadata->getFirstDefinitionSingleBlock(Var1);
5610       if (Op1Def != nullptr && !VMetadata->isMultiBlock(Var1) &&
5611           llvm::isa<InstLoad>(Op1Def)) {
5612         return; // Both are loads
5613       }
5614     }
5615     Icmp->reverseConditionAndOperands();
5616   }
5617 }
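
// For illustration, given
//
//   %v = load i32, i32* %p
//   %c = icmp slt i32 %v, %x
//
// where %v is the only single-block load feeding the compare, the icmp is
// rewritten as
//
//   %c = icmp sgt i32 %x, %v
//
// (condition reversed, operands swapped), which preserves the result while
// ordering the operands in the way the later doAddressOptLoad / load-folding
// steps expect.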
5618 
5619 void TargetX8632::doAddressOptLoad() {
5620   Inst *Instr = iteratorToInst(Context.getCur());
5621   Operand *Addr = Instr->getSrc(0);
5622   Variable *Dest = Instr->getDest();
5623   if (auto *OptAddr = computeAddressOpt(Instr, Dest->getType(), Addr)) {
5624     Instr->setDeleted();
5625     Context.insert<InstLoad>(Dest, OptAddr);
5626   }
5627 }
5628 
5629 void TargetX8632::doAddressOptLoadSubVector() {
5630   auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
5631   Operand *Addr = Intrinsic->getArg(0);
5632   Variable *Dest = Intrinsic->getDest();
5633   if (auto *OptAddr = computeAddressOpt(Intrinsic, Dest->getType(), Addr)) {
5634     Intrinsic->setDeleted();
5635     const Ice::Intrinsics::IntrinsicInfo Info = {
5636         Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F,
5637         Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
5638     auto *NewLoad = Context.insert<InstIntrinsic>(2, Dest, Info);
5639     NewLoad->addArg(OptAddr);
5640     NewLoad->addArg(Intrinsic->getArg(1));
5641   }
5642 }
5643 
5644 void TargetX8632::lowerPhi(const InstPhi * /*Instr*/) {
5645   Func->setError("Phi found in regular instruction list");
5646 }
5647 
5648 void TargetX8632::lowerRet(const InstRet *Instr) {
5649   Variable *Reg = nullptr;
5650   if (Instr->hasRetValue()) {
5651     Operand *RetValue = legalize(Instr->getRetValue());
5652     const Type ReturnType = RetValue->getType();
5653     assert(isVectorType(ReturnType) || isScalarFloatingType(ReturnType) ||
5654            (ReturnType == IceType_i32) || (ReturnType == IceType_i64));
5655     Reg = moveReturnValueToRegister(RetValue, ReturnType);
5656   }
5657   // Add a ret instruction even if sandboxing is enabled, because addEpilog
5658   // explicitly looks for a ret instruction as a marker for where to insert
5659   // the frame removal instructions.
5660   _ret(Reg);
5661   // Add a fake use of esp to make sure esp stays alive for the entire
5662   // function. Otherwise post-call esp adjustments get dead-code eliminated.
5663   keepEspLiveAtExit();
5664 }
5665 
5666 inline uint32_t makePshufdMask(SizeT Index0, SizeT Index1, SizeT Index2,
5667                                SizeT Index3) {
5668   const SizeT Mask = (Index0 & 0x3) | ((Index1 & 0x3) << 2) |
5669                      ((Index2 & 0x3) << 4) | ((Index3 & 0x3) << 6);
5670   assert(Mask < 256);
5671   return Mask;
5672 }
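
// For example, makePshufdMask(0, 1, 2, 3) == 0xE4 (the identity shuffle) and
// makePshufdMask(3, 2, 1, 0) == 0x1B (reverse the four lanes), i.e. the usual
// two-bits-per-destination-lane pshufd immediate encoding.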
5673 
5674 Variable *TargetX8632::lowerShuffleVector_AllFromSameSrc(
5675     Operand *Src, SizeT Index0, SizeT Index1, SizeT Index2, SizeT Index3) {
5676   constexpr SizeT SrcBit = 1 << 2;
5677   assert((Index0 & SrcBit) == (Index1 & SrcBit));
5678   assert((Index0 & SrcBit) == (Index2 & SrcBit));
5679   assert((Index0 & SrcBit) == (Index3 & SrcBit));
5680   (void)SrcBit;
5681 
5682   const Type SrcTy = Src->getType();
5683   auto *T = makeReg(SrcTy);
5684   auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
5685   auto *Mask =
5686       Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
5687   _pshufd(T, SrcRM, Mask);
5688   return T;
5689 }
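
// For example, a <4 x i32> shuffle whose indices (2, 3, 0, 1) all come from
// the same source lowers to a single instruction:
//
//   pshufd T, Src, 0x4E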
5690 
5691 Variable *
5692 TargetX8632::lowerShuffleVector_TwoFromSameSrc(Operand *Src0, SizeT Index0,
5693                                                SizeT Index1, Operand *Src1,
5694                                                SizeT Index2, SizeT Index3) {
5695   constexpr SizeT SrcBit = 1 << 2;
5696   assert((Index0 & SrcBit) == (Index1 & SrcBit) || (Index1 == IGNORE_INDEX));
5697   assert((Index2 & SrcBit) == (Index3 & SrcBit) || (Index3 == IGNORE_INDEX));
5698   (void)SrcBit;
5699 
5700   const Type SrcTy = Src0->getType();
5701   assert(Src1->getType() == SrcTy);
5702   auto *T = makeReg(SrcTy);
5703   auto *Src0R = legalizeToReg(Src0);
5704   auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5705   auto *Mask =
5706       Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
5707   _movp(T, Src0R);
5708   _shufps(T, Src1RM, Mask);
5709   return T;
5710 }
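
// For example, indices (0, 1, 4, 5) -- the low halves of two <4 x i32>
// sources -- lower to
//
//   movp   T, Src0
//   shufps T, Src1, 0x44
//
// since shufps selects its two low result lanes from T (= Src0) and its two
// high result lanes from Src1; that is why Index0/Index1 must name one source
// and Index2/Index3 the other.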
5711 
5712 Variable *TargetX8632::lowerShuffleVector_UnifyFromDifferentSrcs(Operand *Src0,
5713                                                                  SizeT Index0,
5714                                                                  Operand *Src1,
5715                                                                  SizeT Index1) {
5716   return lowerShuffleVector_TwoFromSameSrc(Src0, Index0, IGNORE_INDEX, Src1,
5717                                            Index1, IGNORE_INDEX);
5718 }
5719 
5720 inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2,
5721                                SizeT Index3) {
5722   constexpr SizeT SrcBit = 1 << 2;
5723   const SizeT Index0Bits = ((Index0 & SrcBit) == 0) ? 0 : (1 << 0);
5724   const SizeT Index1Bits = ((Index1 & SrcBit) == 0) ? 0 : (1 << 1);
5725   const SizeT Index2Bits = ((Index2 & SrcBit) == 0) ? 0 : (1 << 2);
5726   const SizeT Index3Bits = ((Index3 & SrcBit) == 0) ? 0 : (1 << 3);
5727   return Index0Bits | Index1Bits | Index2Bits | Index3Bits;
5728 }
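
// For example, with two <4 x i32> sources the indices (0, 5, 2, 7) take
// elements 0 and 2 from Src0 and elements 1 and 3 from Src1 (bit 2 of each
// index selects the source), so makeSrcSwitchMask(0, 5, 2, 7) == 0b1010,
// which corresponds to CASE_SRCS_IN(0, 1, 0, 1) in lowerShuffleVector below.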
5729 
5730 GlobalString TargetX8632::lowerShuffleVector_NewMaskName() {
5731   GlobalString FuncName = Func->getFunctionName();
5732   const SizeT Id = PshufbMaskCount++;
5733   if (!BuildDefs::dump() || !FuncName.hasStdString()) {
5734     return GlobalString::createWithString(
5735         Ctx,
5736         "$PS" + std::to_string(FuncName.getID()) + "_" + std::to_string(Id));
5737   }
5738   return GlobalString::createWithString(
5739       Ctx, "Pshufb$" + Func->getFunctionName() + "$" + std::to_string(Id));
5740 }
5741 
5742 ConstantRelocatable *TargetX8632::lowerShuffleVector_CreatePshufbMask(
5743     int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
5744     int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
5745     int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
5746     int8_t Idx15) {
5747   static constexpr uint8_t NumElements = 16;
5748   const char Initializer[NumElements] = {
5749       Idx0, Idx1, Idx2,  Idx3,  Idx4,  Idx5,  Idx6,  Idx7,
5750       Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15,
5751   };
5752 
5753   static constexpr Type V4VectorType = IceType_v4i32;
5754   const uint32_t MaskAlignment = typeWidthInBytesOnStack(V4VectorType);
5755   auto *Mask = VariableDeclaration::create(Func->getGlobalPool());
5756   GlobalString MaskName = lowerShuffleVector_NewMaskName();
5757   Mask->setIsConstant(true);
5758   Mask->addInitializer(VariableDeclaration::DataInitializer::create(
5759       Func->getGlobalPool(), Initializer, NumElements));
5760   Mask->setName(MaskName);
5761   // Mask needs to be 16-byte aligned, or pshufb will seg fault.
5762   Mask->setAlignment(MaskAlignment);
5763   Func->addGlobal(Mask);
5764 
5765   constexpr RelocOffsetT Offset = 0;
5766   return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName));
5767 }
5768 
5769 void TargetX8632::lowerShuffleVector_UsingPshufb(
5770     Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1,
5771     int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6,
5772     int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11,
5773     int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) {
5774   const Type DestTy = Dest->getType();
5775   static constexpr bool NotRebased = false;
5776   static constexpr Variable *NoBase = nullptr;
5777   // We use void for the memory operand instead of DestTy because using the
5778   // latter causes a validation failure: the X86 Inst layer complains that
5779   // vector mem operands could be under-aligned. Using void thus avoids the
5780   // validation error. Note that the mask global declaration is aligned, so it
5781   // can be used as an XMM mem operand.
5782   static constexpr Type MaskType = IceType_void;
5783 #define IDX_IN_SRC(N, S)                                                       \
5784   ((((N) & (1 << 4)) == (S << 4)) ? ((N)&0xf) : CLEAR_ALL_BITS)
5785   auto *Mask0M = X86OperandMem::create(
5786       Func, MaskType, NoBase,
5787       lowerShuffleVector_CreatePshufbMask(
5788           IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0),
5789           IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0),
5790           IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0),
5791           IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0),
5792           IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0),
5793           IDX_IN_SRC(Idx15, 0)),
5794       NotRebased);
5795 
5796   auto *T0 = makeReg(DestTy);
5797   auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5798   _movp(T0, Src0RM);
5799 
5800   _pshufb(T0, Mask0M);
5801 
5802   if (Idx0 >= 16 || Idx1 >= 16 || Idx2 >= 16 || Idx3 >= 16 || Idx4 >= 16 ||
5803       Idx5 >= 16 || Idx6 >= 16 || Idx7 >= 16 || Idx8 >= 16 || Idx9 >= 16 ||
5804       Idx10 >= 16 || Idx11 >= 16 || Idx12 >= 16 || Idx13 >= 16 || Idx14 >= 16 ||
5805       Idx15 >= 16) {
5806     auto *Mask1M = X86OperandMem::create(
5807         Func, MaskType, NoBase,
5808         lowerShuffleVector_CreatePshufbMask(
5809             IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1),
5810             IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1),
5811             IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1),
5812             IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1),
5813             IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1),
5814             IDX_IN_SRC(Idx15, 1)),
5815         NotRebased);
5816 #undef IDX_IN_SRC
5817     auto *T1 = makeReg(DestTy);
5818     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5819     _movp(T1, Src1RM);
5820     _pshufb(T1, Mask1M);
5821     _por(T0, T1);
5822   }
5823 
5824   _movp(Dest, T0);
5825 }
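
// Sketch of the two-source case above: IDX_IN_SRC keeps an index (mod 16)
// only when it refers to the source being shuffled, and otherwise emits
// CLEAR_ALL_BITS (presumably a mask byte with its top bit set, which makes
// pshufb write a zero into that lane).  The two partial results then combine
// with a plain OR:
//
//   movp   T0, Src0
//   pshufb T0, Mask0    ; lanes that come from Src1 are zeroed
//   movp   T1, Src1
//   pshufb T1, Mask1    ; lanes that come from Src0 are zeroed
//   por    T0, T1
//   movp   Dest, T0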
5826 
5827 void TargetX8632::lowerShuffleVector(const InstShuffleVector *Instr) {
5828   auto *Dest = Instr->getDest();
5829   const Type DestTy = Dest->getType();
5830   auto *Src0 = Instr->getSrc(0);
5831   auto *Src1 = Instr->getSrc(1);
5832   const SizeT NumElements = typeNumElements(DestTy);
5833 
5834   auto *T = makeReg(DestTy);
5835 
5836   switch (DestTy) {
5837   default:
5838     llvm::report_fatal_error("Unexpected vector type.");
5839   case IceType_v16i1:
5840   case IceType_v16i8: {
5841     static constexpr SizeT ExpectedNumElements = 16;
5842     assert(ExpectedNumElements == Instr->getNumIndexes());
5843     (void)ExpectedNumElements;
5844 
5845     if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
5846       auto *T = makeReg(DestTy);
5847       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5848       _movp(T, Src0RM);
5849       _punpckl(T, Src0RM);
5850       _movp(Dest, T);
5851       return;
5852     }
5853 
5854     if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
5855                           23)) {
5856       auto *T = makeReg(DestTy);
5857       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5858       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5859       _movp(T, Src0RM);
5860       _punpckl(T, Src1RM);
5861       _movp(Dest, T);
5862       return;
5863     }
5864 
5865     if (Instr->indexesAre(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
5866                           15, 15)) {
5867       auto *T = makeReg(DestTy);
5868       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5869       _movp(T, Src0RM);
5870       _punpckh(T, Src0RM);
5871       _movp(Dest, T);
5872       return;
5873     }
5874 
5875     if (Instr->indexesAre(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30,
5876                           15, 31)) {
5877       auto *T = makeReg(DestTy);
5878       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5879       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5880       _movp(T, Src0RM);
5881       _punpckh(T, Src1RM);
5882       _movp(Dest, T);
5883       return;
5884     }
5885 
5886     if (InstructionSet < SSE4_1) {
5887       // TODO(jpp): figure out how to lower with sse2.
5888       break;
5889     }
5890 
5891     const SizeT Index0 = Instr->getIndexValue(0);
5892     const SizeT Index1 = Instr->getIndexValue(1);
5893     const SizeT Index2 = Instr->getIndexValue(2);
5894     const SizeT Index3 = Instr->getIndexValue(3);
5895     const SizeT Index4 = Instr->getIndexValue(4);
5896     const SizeT Index5 = Instr->getIndexValue(5);
5897     const SizeT Index6 = Instr->getIndexValue(6);
5898     const SizeT Index7 = Instr->getIndexValue(7);
5899     const SizeT Index8 = Instr->getIndexValue(8);
5900     const SizeT Index9 = Instr->getIndexValue(9);
5901     const SizeT Index10 = Instr->getIndexValue(10);
5902     const SizeT Index11 = Instr->getIndexValue(11);
5903     const SizeT Index12 = Instr->getIndexValue(12);
5904     const SizeT Index13 = Instr->getIndexValue(13);
5905     const SizeT Index14 = Instr->getIndexValue(14);
5906     const SizeT Index15 = Instr->getIndexValue(15);
5907 
5908     lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
5909                                    Index3, Index4, Index5, Index6, Index7,
5910                                    Index8, Index9, Index10, Index11, Index12,
5911                                    Index13, Index14, Index15);
5912     return;
5913   }
5914   case IceType_v8i1:
5915   case IceType_v8i16: {
5916     static constexpr SizeT ExpectedNumElements = 8;
5917     assert(ExpectedNumElements == Instr->getNumIndexes());
5918     (void)ExpectedNumElements;
5919 
5920     if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
5921       auto *T = makeReg(DestTy);
5922       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5923       _movp(T, Src0RM);
5924       _punpckl(T, Src0RM);
5925       _movp(Dest, T);
5926       return;
5927     }
5928 
5929     if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
5930       auto *T = makeReg(DestTy);
5931       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5932       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5933       _movp(T, Src0RM);
5934       _punpckl(T, Src1RM);
5935       _movp(Dest, T);
5936       return;
5937     }
5938 
5939     if (Instr->indexesAre(4, 4, 5, 5, 6, 6, 7, 7)) {
5940       auto *T = makeReg(DestTy);
5941       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5942       _movp(T, Src0RM);
5943       _punpckh(T, Src0RM);
5944       _movp(Dest, T);
5945       return;
5946     }
5947 
5948     if (Instr->indexesAre(4, 12, 5, 13, 6, 14, 7, 15)) {
5949       auto *T = makeReg(DestTy);
5950       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5951       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5952       _movp(T, Src0RM);
5953       _punpckh(T, Src1RM);
5954       _movp(Dest, T);
5955       return;
5956     }
5957 
5958     if (InstructionSet < SSE4_1) {
5959       // TODO(jpp): figure out how to lower with sse2.
5960       break;
5961     }
5962 
5963     const SizeT Index0 = Instr->getIndexValue(0);
5964     const SizeT Index1 = Instr->getIndexValue(1);
5965     const SizeT Index2 = Instr->getIndexValue(2);
5966     const SizeT Index3 = Instr->getIndexValue(3);
5967     const SizeT Index4 = Instr->getIndexValue(4);
5968     const SizeT Index5 = Instr->getIndexValue(5);
5969     const SizeT Index6 = Instr->getIndexValue(6);
5970     const SizeT Index7 = Instr->getIndexValue(7);
5971 
5972 #define TO_BYTE_INDEX(I) ((I) << 1)
5973     lowerShuffleVector_UsingPshufb(
5974         Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
5975         TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2),
5976         TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3),
5977         TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4),
5978         TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5),
5979         TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6),
5980         TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7),
5981         TO_BYTE_INDEX(Index7) + 1);
5982 #undef TO_BYTE_INDEX
5983     return;
5984   }
5985   case IceType_v4i1:
5986   case IceType_v4i32:
5987   case IceType_v4f32: {
5988     static constexpr SizeT ExpectedNumElements = 4;
5989     assert(ExpectedNumElements == Instr->getNumIndexes());
5990     const SizeT Index0 = Instr->getIndexValue(0);
5991     const SizeT Index1 = Instr->getIndexValue(1);
5992     const SizeT Index2 = Instr->getIndexValue(2);
5993     const SizeT Index3 = Instr->getIndexValue(3);
5994     Variable *T = nullptr;
5995     switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
5996 #define CASE_SRCS_IN(S0, S1, S2, S3)                                           \
5997   case (((S0) << 0) | ((S1) << 1) | ((S2) << 2) | ((S3) << 3))
5998       CASE_SRCS_IN(0, 0, 0, 0) : {
5999         T = lowerShuffleVector_AllFromSameSrc(Src0, Index0, Index1, Index2,
6000                                               Index3);
6001       }
6002       break;
6003       CASE_SRCS_IN(0, 0, 0, 1) : {
6004         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
6005                                                                   Src1, Index3);
6006         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
6007                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6008       }
6009       break;
6010       CASE_SRCS_IN(0, 0, 1, 0) : {
6011         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
6012                                                                   Src0, Index3);
6013         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
6014                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6015       }
6016       break;
6017       CASE_SRCS_IN(0, 0, 1, 1) : {
6018         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Src1,
6019                                               Index2, Index3);
6020       }
6021       break;
6022       CASE_SRCS_IN(0, 1, 0, 0) : {
6023         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
6024                                                                   Src1, Index1);
6025         T = lowerShuffleVector_TwoFromSameSrc(
6026             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
6027       }
6028       break;
6029       CASE_SRCS_IN(0, 1, 0, 1) : {
6030         if (Index0 == 0 && (Index1 - ExpectedNumElements) == 0 && Index2 == 1 &&
6031             (Index3 - ExpectedNumElements) == 1) {
6032           auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6033           auto *Src0R = legalizeToReg(Src0);
6034           T = makeReg(DestTy);
6035           _movp(T, Src0R);
6036           _punpckl(T, Src1RM);
6037         } else if (Index0 == Index2 && Index1 == Index3) {
6038           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6039               Src0, Index0, Src1, Index1);
6040           T = lowerShuffleVector_AllFromSameSrc(
6041               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
6042               UNIFIED_INDEX_1);
6043         } else {
6044           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6045               Src0, Index0, Src1, Index1);
6046           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6047               Src0, Index2, Src1, Index3);
6048           T = lowerShuffleVector_TwoFromSameSrc(
6049               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6050               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6051         }
6052       }
6053       break;
6054       CASE_SRCS_IN(0, 1, 1, 0) : {
6055         if (Index0 == Index3 && Index1 == Index2) {
6056           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6057               Src0, Index0, Src1, Index1);
6058           T = lowerShuffleVector_AllFromSameSrc(
6059               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
6060               UNIFIED_INDEX_0);
6061         } else {
6062           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6063               Src0, Index0, Src1, Index1);
6064           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6065               Src1, Index2, Src0, Index3);
6066           T = lowerShuffleVector_TwoFromSameSrc(
6067               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6068               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6069         }
6070       }
6071       break;
6072       CASE_SRCS_IN(0, 1, 1, 1) : {
6073         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
6074                                                                   Src1, Index1);
6075         T = lowerShuffleVector_TwoFromSameSrc(
6076             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
6077       }
6078       break;
6079       CASE_SRCS_IN(1, 0, 0, 0) : {
6080         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
6081                                                                   Src0, Index1);
6082         T = lowerShuffleVector_TwoFromSameSrc(
6083             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
6084       }
6085       break;
6086       CASE_SRCS_IN(1, 0, 0, 1) : {
6087         if (Index0 == Index3 && Index1 == Index2) {
6088           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6089               Src1, Index0, Src0, Index1);
6090           T = lowerShuffleVector_AllFromSameSrc(
6091               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
6092               UNIFIED_INDEX_0);
6093         } else {
6094           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6095               Src1, Index0, Src0, Index1);
6096           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6097               Src0, Index2, Src1, Index3);
6098           T = lowerShuffleVector_TwoFromSameSrc(
6099               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6100               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6101         }
6102       }
6103       break;
6104       CASE_SRCS_IN(1, 0, 1, 0) : {
6105         if ((Index0 - ExpectedNumElements) == 0 && Index1 == 0 &&
6106             (Index2 - ExpectedNumElements) == 1 && Index3 == 1) {
6107           auto *Src1RM = legalize(Src0, Legal_Reg | Legal_Mem);
6108           auto *Src0R = legalizeToReg(Src1);
6109           T = makeReg(DestTy);
6110           _movp(T, Src0R);
6111           _punpckl(T, Src1RM);
6112         } else if (Index0 == Index2 && Index1 == Index3) {
6113           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6114               Src1, Index0, Src0, Index1);
6115           T = lowerShuffleVector_AllFromSameSrc(
6116               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
6117               UNIFIED_INDEX_1);
6118         } else {
6119           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6120               Src1, Index0, Src0, Index1);
6121           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6122               Src1, Index2, Src0, Index3);
6123           T = lowerShuffleVector_TwoFromSameSrc(
6124               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6125               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6126         }
6127       }
6128       break;
6129       CASE_SRCS_IN(1, 0, 1, 1) : {
6130         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
6131                                                                   Src0, Index1);
6132         T = lowerShuffleVector_TwoFromSameSrc(
6133             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
6134       }
6135       break;
6136       CASE_SRCS_IN(1, 1, 0, 0) : {
6137         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Src0,
6138                                               Index2, Index3);
6139       }
6140       break;
6141       CASE_SRCS_IN(1, 1, 0, 1) : {
6142         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
6143                                                                   Src1, Index3);
6144         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
6145                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6146       }
6147       break;
6148       CASE_SRCS_IN(1, 1, 1, 0) : {
6149         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
6150                                                                   Src0, Index3);
6151         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
6152                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6153       }
6154       break;
6155       CASE_SRCS_IN(1, 1, 1, 1) : {
6156         T = lowerShuffleVector_AllFromSameSrc(Src1, Index0, Index1, Index2,
6157                                               Index3);
6158       }
6159       break;
6160 #undef CASE_SRCS_IN
6161     }
6162 
6163     assert(T != nullptr);
6164     assert(T->getType() == DestTy);
6165     _movp(Dest, T);
6166     return;
6167   } break;
6168   }
6169 
6170   // Unoptimized shuffle. Perform a series of inserts and extracts.
6171   Context.insert<InstFakeDef>(T);
6172   const Type ElementType = typeElementType(DestTy);
6173   for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
6174     auto *Index = Instr->getIndex(I);
6175     const SizeT Elem = Index->getValue();
6176     auto *ExtElmt = makeReg(ElementType);
6177     if (Elem < NumElements) {
6178       lowerExtractElement(
6179           InstExtractElement::create(Func, ExtElmt, Src0, Index));
6180     } else {
6181       lowerExtractElement(InstExtractElement::create(
6182           Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements)));
6183     }
6184     auto *NewT = makeReg(DestTy);
6185     lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
6186                                                  Ctx->getConstantInt32(I)));
6187     T = NewT;
6188   }
6189   _movp(Dest, T);
6190 }
6191 
6192 void TargetX8632::lowerSelect(const InstSelect *Select) {
6193   Variable *Dest = Select->getDest();
6194 
6195   Operand *Condition = Select->getCondition();
6196   // Handle folding opportunities.
6197   if (const Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
6198     assert(Producer->isDeleted());
6199     switch (BoolFolding::getProducerKind(Producer)) {
6200     default:
6201       break;
6202     case BoolFolding::PK_Icmp32:
6203     case BoolFolding::PK_Icmp64: {
6204       lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Select);
6205       return;
6206     }
6207     case BoolFolding::PK_Fcmp: {
6208       lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Select);
6209       return;
6210     }
6211     }
6212   }
6213 
6214   if (isVectorType(Dest->getType())) {
6215     lowerSelectVector(Select);
6216     return;
6217   }
6218 
6219   Operand *CmpResult = legalize(Condition, Legal_Reg | Legal_Mem);
6220   Operand *Zero = Ctx->getConstantZero(IceType_i32);
6221   _cmp(CmpResult, Zero);
6222   Operand *SrcT = Select->getTrueOperand();
6223   Operand *SrcF = Select->getFalseOperand();
6224   const BrCond Cond = CondX86::Br_ne;
6225   lowerSelectMove(Dest, Cond, SrcT, SrcF);
6226 }
6227 
6228 void TargetX8632::lowerSelectMove(Variable *Dest, BrCond Cond, Operand *SrcT,
6229                                   Operand *SrcF) {
6230   Type DestTy = Dest->getType();
6231   if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
6232     // The cmov instruction doesn't allow 8-bit or FP operands, so we need
6233     // explicit control flow.
6234     // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
6235     auto *Label = InstX86Label::create(Func, this);
6236     SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
6237     _mov(Dest, SrcT);
6238     _br(Cond, Label);
6239     SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
6240     _redefined(_mov(Dest, SrcF));
6241     Context.insert(Label);
6242     return;
6243   }
6244   // mov t, SrcF; cmov_cond t, SrcT; mov dest, t
6245   // But if SrcT is immediate, we might be able to do better, as the cmov
6246   // instruction doesn't allow an immediate operand:
6247   // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
6248   if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
6249     std::swap(SrcT, SrcF);
6250     Cond = InstX86Base::getOppositeCondition(Cond);
6251   }
6252   if (DestTy == IceType_i64) {
6253     SrcT = legalizeUndef(SrcT);
6254     SrcF = legalizeUndef(SrcF);
6255     // Set the low portion.
6256     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
6257     lowerSelectIntMove(DestLo, Cond, loOperand(SrcT), loOperand(SrcF));
6258     // Set the high portion.
6259     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
6260     lowerSelectIntMove(DestHi, Cond, hiOperand(SrcT), hiOperand(SrcF));
6261     return;
6262   }
6263 
6264   assert(DestTy == IceType_i16 || DestTy == IceType_i32);
6265   lowerSelectIntMove(Dest, Cond, SrcT, SrcF);
6266 }
6267 
6268 void TargetX8632::lowerSelectIntMove(Variable *Dest, BrCond Cond, Operand *SrcT,
6269                                      Operand *SrcF) {
6270   Variable *T = nullptr;
6271   SrcF = legalize(SrcF);
6272   _mov(T, SrcF);
6273   SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
6274   _cmov(T, SrcT, Cond);
6275   _mov(Dest, T);
6276 }
6277 
6278 void TargetX8632::lowerMove(Variable *Dest, Operand *Src, bool IsRedefinition) {
6279   assert(Dest->getType() == Src->getType());
6280   assert(!Dest->isRematerializable());
6281   if (Dest->getType() == IceType_i64) {
6282     Src = legalize(Src);
6283     Operand *SrcLo = loOperand(Src);
6284     Operand *SrcHi = hiOperand(Src);
6285     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
6286     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
6287     Variable *T_Lo = nullptr, *T_Hi = nullptr;
6288     _mov(T_Lo, SrcLo);
6289     _redefined(_mov(DestLo, T_Lo), IsRedefinition);
6290     _mov(T_Hi, SrcHi);
6291     _redefined(_mov(DestHi, T_Hi), IsRedefinition);
6292   } else {
6293     Operand *SrcLegal;
6294     if (Dest->hasReg()) {
6295       // If Dest already has a physical register, then only basic legalization
6296       // is needed, as the source operand can be a register, immediate, or
6297       // memory.
6298       SrcLegal = legalize(Src, Legal_Reg, Dest->getRegNum());
6299     } else {
6300       // If Dest could be a stack operand, then RI must be a physical register
6301       // or a scalar integer immediate.
6302       SrcLegal = legalize(Src, Legal_Reg | Legal_Imm);
6303     }
6304     if (isVectorType(Dest->getType())) {
6305       _redefined(_movp(Dest, SrcLegal), IsRedefinition);
6306     } else {
6307       _redefined(_mov(Dest, SrcLegal), IsRedefinition);
6308     }
6309   }
6310 }
6311 
6312 bool TargetX8632::lowerOptimizeFcmpSelect(const InstFcmp *Fcmp,
6313                                           const InstSelect *Select) {
6314   Operand *CmpSrc0 = Fcmp->getSrc(0);
6315   Operand *CmpSrc1 = Fcmp->getSrc(1);
6316   Operand *SelectSrcT = Select->getTrueOperand();
6317   Operand *SelectSrcF = Select->getFalseOperand();
6318   Variable *SelectDest = Select->getDest();
6319 
6320   // TODO(capn): also handle swapped compare/select operand order.
6321   if (CmpSrc0 != SelectSrcT || CmpSrc1 != SelectSrcF)
6322     return false;
6323 
6324   // TODO(sehr, stichnot): fcmp/select patterns (e.g., minsd/maxss) go here.
6325   InstFcmp::FCond Condition = Fcmp->getCondition();
6326   switch (Condition) {
6327   default:
6328     return false;
6329   case InstFcmp::True:
6330     break;
6331   case InstFcmp::False:
6332     break;
6333   case InstFcmp::Ogt: {
6334     Variable *T = makeReg(SelectDest->getType());
6335     if (isScalarFloatingType(SelectSrcT->getType())) {
6336       _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6337       _maxss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6338       _mov(SelectDest, T);
6339     } else {
6340       _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6341       _maxps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6342       _movp(SelectDest, T);
6343     }
6344     return true;
6345   } break;
6346   case InstFcmp::Olt: {
6347     Variable *T = makeReg(SelectSrcT->getType());
6348     if (isScalarFloatingType(SelectSrcT->getType())) {
6349       _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6350       _minss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6351       _mov(SelectDest, T);
6352     } else {
6353       _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6354       _minps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6355       _movp(SelectDest, T);
6356     }
6357     return true;
6358   } break;
6359   }
6360   return false;
6361 }
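
// For example, the scalar pattern
//
//   %c = fcmp ogt float %a, %b
//   %r = select i1 %c, float %a, float %b
//
// matches the Ogt case above (CmpSrc0 == SelectSrcT, CmpSrc1 == SelectSrcF)
// and is lowered to maxss plus register moves; the Olt form becomes minss,
// and the vector forms use maxps/minps.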
6362 
6363 void TargetX8632::lowerIcmp(const InstIcmp *Icmp) {
6364   Variable *Dest = Icmp->getDest();
6365   if (isVectorType(Dest->getType())) {
6366     lowerIcmpVector(Icmp);
6367   } else {
6368     constexpr Inst *Consumer = nullptr;
6369     lowerIcmpAndConsumer(Icmp, Consumer);
6370   }
6371 }
6372 
6373 void TargetX8632::lowerSelectVector(const InstSelect *Instr) {
6374   Variable *Dest = Instr->getDest();
6375   Type DestTy = Dest->getType();
6376   Operand *SrcT = Instr->getTrueOperand();
6377   Operand *SrcF = Instr->getFalseOperand();
6378   Operand *Condition = Instr->getCondition();
6379 
6380   if (!isVectorType(DestTy))
6381     llvm::report_fatal_error("Expected a vector select");
6382 
6383   Type SrcTy = SrcT->getType();
6384   Variable *T = makeReg(SrcTy);
6385   Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
6386   Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
6387 
6388   if (InstructionSet >= SSE4_1) {
6389     // TODO(wala): If the condition operand is a constant, use blendps or
6390     // pblendw.
6391     //
6392     // Use blendvps or pblendvb to implement select.
6393     if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
6394         SrcTy == IceType_v4f32) {
6395       Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
6396       Variable *xmm0 = makeReg(IceType_v4i32, RegX8632::Reg_xmm0);
6397       _movp(xmm0, ConditionRM);
6398       _psll(xmm0, Ctx->getConstantInt8(31));
6399       _movp(T, SrcFRM);
6400       _blendvps(T, SrcTRM, xmm0);
6401       _movp(Dest, T);
6402     } else {
6403       assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
6404       Type SignExtTy =
6405           Condition->getType() == IceType_v8i1 ? IceType_v8i16 : IceType_v16i8;
6406       Variable *xmm0 = makeReg(SignExtTy, RegX8632::Reg_xmm0);
6407       lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
6408       _movp(T, SrcFRM);
6409       _pblendvb(T, SrcTRM, xmm0);
6410       _movp(Dest, T);
6411     }
6412     return;
6413   }
6414   // Lower select without SSE4.1:
6415   // a=d?b:c ==>
6416   //   if elementtype(d) != i1:
6417   //      d=sext(d);
6418   //   a=(b&d)|(c&~d);
6419   Variable *T2 = makeReg(SrcTy);
6420   // Sign extend the condition operand if applicable.
6421   if (SrcTy == IceType_v4f32) {
6422     // The sext operation takes only integer arguments.
6423     Variable *T3 = Func->makeVariable(IceType_v4i32);
6424     lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
6425     _movp(T, T3);
6426   } else if (typeElementType(SrcTy) != IceType_i1) {
6427     lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
6428   } else {
6429     Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
6430     _movp(T, ConditionRM);
6431   }
6432   _movp(T2, T);
6433   _pand(T, SrcTRM);
6434   _pandn(T2, SrcFRM);
6435   _por(T, T2);
6436   _movp(Dest, T);
6437 
6438   return;
6439 }
6440 
6441 void TargetX8632::lowerStore(const InstStore *Instr) {
6442   Operand *Value = Instr->getData();
6443   Operand *Addr = Instr->getStoreAddress();
6444   X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
6445   doMockBoundsCheck(NewAddr);
6446   Type Ty = NewAddr->getType();
6447 
6448   if (Ty == IceType_i64) {
6449     Value = legalizeUndef(Value);
6450     Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
6451     _store(ValueHi, llvm::cast<X86OperandMem>(hiOperand(NewAddr)));
6452     Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
6453     _store(ValueLo, llvm::cast<X86OperandMem>(loOperand(NewAddr)));
6454   } else if (isVectorType(Ty)) {
6455     _storep(legalizeToReg(Value), NewAddr);
6456   } else {
6457     Value = legalize(Value, Legal_Reg | Legal_Imm);
6458     _store(Value, NewAddr);
6459   }
6460 }
6461 
6462 void TargetX8632::doAddressOptStore() {
6463   auto *Instr = llvm::cast<InstStore>(Context.getCur());
6464   Operand *Addr = Instr->getStoreAddress();
6465   Operand *Data = Instr->getData();
6466   if (auto *OptAddr = computeAddressOpt(Instr, Data->getType(), Addr)) {
6467     Instr->setDeleted();
6468     auto *NewStore = Context.insert<InstStore>(Data, OptAddr);
6469     if (Instr->getDest())
6470       NewStore->setRmwBeacon(Instr->getRmwBeacon());
6471   }
6472 }
6473 
6474 void TargetX8632::doAddressOptStoreSubVector() {
6475   auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
6476   Operand *Addr = Intrinsic->getArg(1);
6477   Operand *Data = Intrinsic->getArg(0);
6478   if (auto *OptAddr = computeAddressOpt(Intrinsic, Data->getType(), Addr)) {
6479     Intrinsic->setDeleted();
6480     const Ice::Intrinsics::IntrinsicInfo Info = {
6481         Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T,
6482         Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
6483     auto *NewStore = Context.insert<InstIntrinsic>(3, nullptr, Info);
6484     NewStore->addArg(Data);
6485     NewStore->addArg(OptAddr);
6486     NewStore->addArg(Intrinsic->getArg(2));
6487   }
6488 }
6489 
6490 Operand *TargetX8632::lowerCmpRange(Operand *Comparison, uint64_t Min,
6491                                     uint64_t Max) {
6492   // TODO(ascull): 64-bit should not reach here but only because it is not
6493   // implemented yet. This should be able to handle the 64-bit case.
6494   assert(Comparison->getType() != IceType_i64);
6495   // Subtracting 0 is a nop so don't do it
6496   if (Min != 0) {
6497     // Avoid clobbering the comparison by copying it
6498     Variable *T = nullptr;
6499     _mov(T, Comparison);
6500     _sub(T, Ctx->getConstantInt32(Min));
6501     Comparison = T;
6502   }
6503 
6504   _cmp(Comparison, Ctx->getConstantInt32(Max - Min));
6505 
6506   return Comparison;
6507 }
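
// For example, testing whether Comparison falls in a case range [10, 42]
// emits roughly
//
//   mov t, comparison
//   sub t, 10
//   cmp t, 32          ; 42 - 10
//
// after which the caller branches "above" for out-of-range or
// "below-or-equal" for in-range, so one unsigned compare covers the whole
// range.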
6508 
6509 void TargetX8632::lowerCaseCluster(const CaseCluster &Case, Operand *Comparison,
6510                                    bool DoneCmp, CfgNode *DefaultTarget) {
6511   switch (Case.getKind()) {
6512   case CaseCluster::JumpTable: {
6513     InstX86Label *SkipJumpTable;
6514 
6515     Operand *RangeIndex =
6516         lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
6517     if (DefaultTarget == nullptr) {
6518       // Skip over jump table logic if comparison not in range and no default
6519       SkipJumpTable = InstX86Label::create(Func, this);
6520       _br(CondX86::Br_a, SkipJumpTable);
6521     } else {
6522       _br(CondX86::Br_a, DefaultTarget);
6523     }
6524 
6525     InstJumpTable *JumpTable = Case.getJumpTable();
6526     Context.insert(JumpTable);
6527 
6528     // Make sure the index is a register of the same width as the base
6529     Variable *Index;
6530     const Type PointerType = IceType_i32;
6531     if (RangeIndex->getType() != PointerType) {
6532       Index = makeReg(PointerType);
6533       assert(RangeIndex->getType() != IceType_i64);
6534       Operand *RangeIndexRM = legalize(RangeIndex, Legal_Reg | Legal_Mem);
6535       _movzx(Index, RangeIndexRM);
6536     } else {
6537       Index = legalizeToReg(RangeIndex);
6538     }
6539 
6540     constexpr RelocOffsetT RelocOffset = 0;
6541     constexpr Variable *NoBase = nullptr;
6542     constexpr Constant *NoOffset = nullptr;
6543     auto JTName = GlobalString::createWithString(Ctx, JumpTable->getName());
6544     Constant *Offset = Ctx->getConstantSym(RelocOffset, JTName);
6545     uint16_t Shift = typeWidthInBytesLog2(PointerType);
6546     constexpr auto Segment = X86OperandMem::SegmentRegisters::DefaultSegment;
6547 
6548     Variable *Target = nullptr;
6549     if (PointerType == IceType_i32) {
6550       _mov(Target, X86OperandMem::create(Func, PointerType, NoBase, Offset,
6551                                          Index, Shift, Segment));
6552     } else {
6553       auto *Base = makeReg(IceType_i64);
6554       _lea(Base, X86OperandMem::create(Func, IceType_void, NoBase, Offset));
6555       _mov(Target, X86OperandMem::create(Func, PointerType, Base, NoOffset,
6556                                          Index, Shift, Segment));
6557     }
6558 
6559     lowerIndirectJump(Target);
6560 
6561     if (DefaultTarget == nullptr)
6562       Context.insert(SkipJumpTable);
6563     return;
6564   }
6565   case CaseCluster::Range: {
6566     if (Case.isUnitRange()) {
6567       // Single item
6568       if (!DoneCmp) {
6569         Constant *Value = Ctx->getConstantInt32(Case.getLow());
6570         _cmp(Comparison, Value);
6571       }
6572       _br(CondX86::Br_e, Case.getTarget());
6573     } else if (DoneCmp && Case.isPairRange()) {
6574       // Range of two items with the first item already compared against
6575       _br(CondX86::Br_e, Case.getTarget());
6576       Constant *Value = Ctx->getConstantInt32(Case.getHigh());
6577       _cmp(Comparison, Value);
6578       _br(CondX86::Br_e, Case.getTarget());
6579     } else {
6580       // Range
6581       lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
6582       _br(CondX86::Br_be, Case.getTarget());
6583     }
6584     if (DefaultTarget != nullptr)
6585       _br(DefaultTarget);
6586     return;
6587   }
6588   }
6589 }
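
// For the JumpTable case above, on x86-32 the indirect branch ends up as
// something like
//
//   mov target, dword ptr [<jump table symbol> + index*4]
//   jmp target
//
// where index has already been range-checked and rebased by lowerCmpRange.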
6590 
6591 void TargetX8632::lowerSwitch(const InstSwitch *Instr) {
6592   // Group cases together and navigate through them with a binary search
6593   CaseClusterArray CaseClusters = CaseCluster::clusterizeSwitch(Func, Instr);
6594   Operand *Src0 = Instr->getComparison();
6595   CfgNode *DefaultTarget = Instr->getLabelDefault();
6596 
6597   assert(CaseClusters.size() != 0); // Should always be at least one
6598 
6599   if (Src0->getType() == IceType_i64) {
6600     Src0 = legalize(Src0); // get Base/Index into physical registers
6601     Operand *Src0Lo = loOperand(Src0);
6602     Operand *Src0Hi = hiOperand(Src0);
6603     if (CaseClusters.back().getHigh() > UINT32_MAX) {
6604       // TODO(ascull): handle 64-bit case properly (currently naive version)
6605       // This might be handled by a higher level lowering of switches.
6606       SizeT NumCases = Instr->getNumCases();
6607       if (NumCases >= 2) {
6608         Src0Lo = legalizeToReg(Src0Lo);
6609         Src0Hi = legalizeToReg(Src0Hi);
6610       } else {
6611         Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
6612         Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
6613       }
6614       for (SizeT I = 0; I < NumCases; ++I) {
6615         Constant *ValueLo = Ctx->getConstantInt32(Instr->getValue(I));
6616         Constant *ValueHi = Ctx->getConstantInt32(Instr->getValue(I) >> 32);
6617         InstX86Label *Label = InstX86Label::create(Func, this);
6618         _cmp(Src0Lo, ValueLo);
6619         _br(CondX86::Br_ne, Label);
6620         _cmp(Src0Hi, ValueHi);
6621         _br(CondX86::Br_e, Instr->getLabel(I));
6622         Context.insert(Label);
6623       }
6624       _br(Instr->getLabelDefault());
6625       return;
6626     } else {
6627       // All the values are 32-bit so just check the operand is too and then
6628       // fall through to the 32-bit implementation. This is a common case.
6629       Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
6630       Constant *Zero = Ctx->getConstantInt32(0);
6631       _cmp(Src0Hi, Zero);
6632       _br(CondX86::Br_ne, DefaultTarget);
6633       Src0 = Src0Lo;
6634     }
6635   }
6636 
6637   // 32-bit lowering
6638 
6639   if (CaseClusters.size() == 1) {
6640     // Jump straight to default if needed. Currently a common case as jump
6641     // tables occur on their own.
6642     constexpr bool DoneCmp = false;
6643     lowerCaseCluster(CaseClusters.front(), Src0, DoneCmp, DefaultTarget);
6644     return;
6645   }
6646 
6647   // The comparison is used multiple times, so get it into a register early
6648   Variable *Comparison = legalizeToReg(Src0);
6649 
6650   // A span is a contiguous range of clusters
6651   struct SearchSpan {
6652     SearchSpan(SizeT Begin, SizeT Size, InstX86Label *Label)
6653         : Begin(Begin), Size(Size), Label(Label) {}
6654 
6655     SizeT Begin;
6656     SizeT Size;
6657     InstX86Label *Label;
6658   };
6659   // The stack will only grow to the height of the tree so 12 should be plenty
6660   std::stack<SearchSpan, llvm::SmallVector<SearchSpan, 12>> SearchSpanStack;
6661   SearchSpanStack.emplace(0, CaseClusters.size(), nullptr);
6662   bool DoneCmp = false;
6663 
6664   while (!SearchSpanStack.empty()) {
6665     SearchSpan Span = SearchSpanStack.top();
6666     SearchSpanStack.pop();
6667 
6668     if (Span.Label != nullptr)
6669       Context.insert(Span.Label);
6670 
6671     switch (Span.Size) {
6672     case 0:
6673       llvm::report_fatal_error("Invalid SearchSpan size");
6674       break;
6675 
6676     case 1:
6677       lowerCaseCluster(CaseClusters[Span.Begin], Comparison, DoneCmp,
6678                        SearchSpanStack.empty() ? nullptr : DefaultTarget);
6679       DoneCmp = false;
6680       break;
6681 
6682     case 2: {
6683       const CaseCluster *CaseA = &CaseClusters[Span.Begin];
6684       const CaseCluster *CaseB = &CaseClusters[Span.Begin + 1];
6685 
6686       // Placing a range last may allow register clobbering during the range
6687       // test. That means there is no need to clone the register. If it is a
6688       // unit range the comparison may have already been done in the binary
6689       // search (DoneCmp) and so it should be placed first. If this is a range
6690       // of two items and the comparison with the low value has already been
6691       // done, comparing with the other element is cheaper than a range test.
6692       // If the low end of the range is zero then there is no subtraction and
6693       // nothing to be gained.
6694       if (!CaseA->isUnitRange() &&
6695           !(CaseA->getLow() == 0 || (DoneCmp && CaseA->isPairRange()))) {
6696         std::swap(CaseA, CaseB);
6697         DoneCmp = false;
6698       }
6699 
6700       lowerCaseCluster(*CaseA, Comparison, DoneCmp);
6701       DoneCmp = false;
6702       lowerCaseCluster(*CaseB, Comparison, DoneCmp,
6703                        SearchSpanStack.empty() ? nullptr : DefaultTarget);
6704     } break;
6705 
6706     default:
6707       // Pick the middle item and branch b or ae
6708       SizeT PivotIndex = Span.Begin + (Span.Size / 2);
6709       const CaseCluster &Pivot = CaseClusters[PivotIndex];
6710       Constant *Value = Ctx->getConstantInt32(Pivot.getLow());
6711       InstX86Label *Label = InstX86Label::create(Func, this);
6712       _cmp(Comparison, Value);
6713       // TODO(ascull): does it always have to be far?
6714       _br(CondX86::Br_b, Label, InstX86Br::Far);
6715       // Lower the left and (pivot+right) sides, falling through to the right
6716       SearchSpanStack.emplace(Span.Begin, Span.Size / 2, Label);
6717       SearchSpanStack.emplace(PivotIndex, Span.Size - (Span.Size / 2), nullptr);
6718       DoneCmp = true;
6719       break;
6720     }
6721   }
6722 
6723   _br(DefaultTarget);
6724 }
6725 
6726 /// The following pattern occurs often in lowered C and C++ code:
6727 ///
6728 ///   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
6729 ///   %cmp.ext = sext <n x i1> %cmp to <n x ty>
6730 ///
6731 /// We can eliminate the sext operation by copying the result of pcmpeqd,
6732 /// pcmpgtd, or cmpps (which produce sign extended results) to the result of
6733 /// the sext operation.
6734 
6735 void TargetX8632::eliminateNextVectorSextInstruction(
6736     Variable *SignExtendedResult) {
6737   if (auto *NextCast =
6738           llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
6739     if (NextCast->getCastKind() == InstCast::Sext &&
6740         NextCast->getSrc(0) == SignExtendedResult) {
6741       NextCast->setDeleted();
6742       _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult));
6743       // Skip over the instruction.
6744       Context.advanceNext();
6745     }
6746   }
6747 }
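
// For example, given
//
//   %cmp = icmp sgt <4 x i32> %a, %b
//   %ext = sext <4 x i1> %cmp to <4 x i32>
//
// the pcmpgtd used for %cmp already leaves each lane as all-ones or
// all-zeros, so the sext is deleted and %ext just receives a movp copy of
// the compare result.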
6748 
6749 void TargetX8632::lowerUnreachable(const InstUnreachable * /*Instr*/) {
6750   _ud2();
6751   // Add a fake use of esp to make sure esp adjustments after the unreachable
6752   // do not get dead-code eliminated.
6753   keepEspLiveAtExit();
6754 }
6755 
6756 void TargetX8632::lowerBreakpoint(const InstBreakpoint * /*Instr*/) { _int3(); }
6757 
6758 void TargetX8632::lowerRMW(const InstX86FakeRMW *RMW) {
6759   // If the beacon variable's live range does not end in this instruction,
6760   // then it must end in the modified Store instruction that follows. This
6761   // means that the original Store instruction is still there, either because
6762   // the value being stored is used beyond the Store instruction, or because
6763   // dead code elimination did not happen. In either case, we cancel RMW
6764   // lowering (and the caller deletes the RMW instruction).
6765   if (!RMW->isLastUse(RMW->getBeacon()))
6766     return;
6767   Operand *Src = RMW->getData();
6768   Type Ty = Src->getType();
6769   X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
6770   doMockBoundsCheck(Addr);
6771   if (Ty == IceType_i64) {
6772     Src = legalizeUndef(Src);
6773     Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm);
6774     Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm);
6775     auto *AddrLo = llvm::cast<X86OperandMem>(loOperand(Addr));
6776     auto *AddrHi = llvm::cast<X86OperandMem>(hiOperand(Addr));
6777     switch (RMW->getOp()) {
6778     default:
6779       // TODO(stichnot): Implement other arithmetic operators.
6780       break;
6781     case InstArithmetic::Add:
6782       _add_rmw(AddrLo, SrcLo);
6783       _adc_rmw(AddrHi, SrcHi);
6784       return;
6785     case InstArithmetic::Sub:
6786       _sub_rmw(AddrLo, SrcLo);
6787       _sbb_rmw(AddrHi, SrcHi);
6788       return;
6789     case InstArithmetic::And:
6790       _and_rmw(AddrLo, SrcLo);
6791       _and_rmw(AddrHi, SrcHi);
6792       return;
6793     case InstArithmetic::Or:
6794       _or_rmw(AddrLo, SrcLo);
6795       _or_rmw(AddrHi, SrcHi);
6796       return;
6797     case InstArithmetic::Xor:
6798       _xor_rmw(AddrLo, SrcLo);
6799       _xor_rmw(AddrHi, SrcHi);
6800       return;
6801     }
6802   } else {
6803     // x86-32: i8, i16, i32
6804     // x86-64: i8, i16, i32, i64
6805     switch (RMW->getOp()) {
6806     default:
6807       // TODO(stichnot): Implement other arithmetic operators.
6808       break;
6809     case InstArithmetic::Add:
6810       Src = legalize(Src, Legal_Reg | Legal_Imm);
6811       _add_rmw(Addr, Src);
6812       return;
6813     case InstArithmetic::Sub:
6814       Src = legalize(Src, Legal_Reg | Legal_Imm);
6815       _sub_rmw(Addr, Src);
6816       return;
6817     case InstArithmetic::And:
6818       Src = legalize(Src, Legal_Reg | Legal_Imm);
6819       _and_rmw(Addr, Src);
6820       return;
6821     case InstArithmetic::Or:
6822       Src = legalize(Src, Legal_Reg | Legal_Imm);
6823       _or_rmw(Addr, Src);
6824       return;
6825     case InstArithmetic::Xor:
6826       Src = legalize(Src, Legal_Reg | Legal_Imm);
6827       _xor_rmw(Addr, Src);
6828       return;
6829     }
6830   }
6831   llvm::report_fatal_error("Couldn't lower RMW instruction");
6832 }
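
// As a concrete example, an i32 read-modify-write sequence such as
//
//   %old = load i32, i32* %p
//   %new = add i32 %old, 7
//   store i32 %new, i32* %p
//
// that was previously folded into an InstX86FakeRMW lowers to a single
//
//   add dword ptr [p], 7
//
// while the i64 Add case uses an add/adc pair on the two 32-bit halves.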
6833 
6834 void TargetX8632::lowerOther(const Inst *Instr) {
6835   if (const auto *RMW = llvm::dyn_cast<InstX86FakeRMW>(Instr)) {
6836     lowerRMW(RMW);
6837   } else {
6838     TargetLowering::lowerOther(Instr);
6839   }
6840 }
6841 
6842 /// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to
6843 /// preserve integrity of liveness analysis. Undef values are also turned into
6844 /// zeroes, since loOperand() and hiOperand() don't expect Undef input.
6845 void TargetX8632::prelowerPhis() {
6846   PhiLowering::prelowerPhis32Bit<TargetX8632>(this, Context.getNode(), Func);
6847 }
6848 
6849 void TargetX8632::genTargetHelperCallFor(Inst *Instr) {
6850   uint32_t StackArgumentsSize = 0;
6851   if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
6852     RuntimeHelper HelperID = RuntimeHelper::H_Num;
6853     Variable *Dest = Arith->getDest();
6854     Type DestTy = Dest->getType();
6855     if (DestTy == IceType_i64) {
6856       switch (Arith->getOp()) {
6857       default:
6858         return;
6859       case InstArithmetic::Udiv:
6860         HelperID = RuntimeHelper::H_udiv_i64;
6861         break;
6862       case InstArithmetic::Sdiv:
6863         HelperID = RuntimeHelper::H_sdiv_i64;
6864         break;
6865       case InstArithmetic::Urem:
6866         HelperID = RuntimeHelper::H_urem_i64;
6867         break;
6868       case InstArithmetic::Srem:
6869         HelperID = RuntimeHelper::H_srem_i64;
6870         break;
6871       }
6872     } else if (isVectorType(DestTy)) {
6873       Variable *Dest = Arith->getDest();
6874       Operand *Src0 = Arith->getSrc(0);
6875       Operand *Src1 = Arith->getSrc(1);
6876       switch (Arith->getOp()) {
6877       default:
6878         return;
6879       case InstArithmetic::Mul:
6880         if (DestTy == IceType_v16i8) {
6881           scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
6882           Arith->setDeleted();
6883         }
6884         return;
6885       case InstArithmetic::Shl:
6886       case InstArithmetic::Lshr:
6887       case InstArithmetic::Ashr:
6888         if (llvm::isa<Constant>(Src1)) {
6889           return;
6890         }
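        // A non-constant shift amount is not handled natively; fall through
        // and scalarize it like the division/remainder cases below.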
6891       case InstArithmetic::Udiv:
6892       case InstArithmetic::Urem:
6893       case InstArithmetic::Sdiv:
6894       case InstArithmetic::Srem:
6895       case InstArithmetic::Frem:
6896         scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
6897         Arith->setDeleted();
6898         return;
6899       }
6900     } else {
6901       switch (Arith->getOp()) {
6902       default:
6903         return;
6904       case InstArithmetic::Frem:
6905         if (isFloat32Asserting32Or64(DestTy))
6906           HelperID = RuntimeHelper::H_frem_f32;
6907         else
6908           HelperID = RuntimeHelper::H_frem_f64;
6909       }
6910     }
6911     constexpr SizeT MaxSrcs = 2;
6912     InstCall *Call = makeHelperCall(HelperID, Dest, MaxSrcs);
6913     Call->addArg(Arith->getSrc(0));
6914     Call->addArg(Arith->getSrc(1));
6915     StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
6916     Context.insert(Call);
6917     Arith->setDeleted();
6918   } else if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
6919     InstCast::OpKind CastKind = Cast->getCastKind();
6920     Operand *Src0 = Cast->getSrc(0);
6921     const Type SrcType = Src0->getType();
6922     Variable *Dest = Cast->getDest();
6923     const Type DestTy = Dest->getType();
6924     RuntimeHelper HelperID = RuntimeHelper::H_Num;
6925     Variable *CallDest = Dest;
6926     switch (CastKind) {
6927     default:
6928       return;
6929     case InstCast::Fptosi:
6930       if (DestTy == IceType_i64) {
6931         HelperID = isFloat32Asserting32Or64(SrcType)
6932                        ? RuntimeHelper::H_fptosi_f32_i64
6933                        : RuntimeHelper::H_fptosi_f64_i64;
6934       } else {
6935         return;
6936       }
6937       break;
6938     case InstCast::Fptoui:
6939       if (isVectorType(DestTy)) {
6940         assert(DestTy == IceType_v4i32);
6941         assert(SrcType == IceType_v4f32);
6942         HelperID = RuntimeHelper::H_fptoui_4xi32_f32;
6943       } else if (DestTy == IceType_i64 || DestTy == IceType_i32) {
6944         if (isInt32Asserting32Or64(DestTy)) {
6945           HelperID = isFloat32Asserting32Or64(SrcType)
6946                          ? RuntimeHelper::H_fptoui_f32_i32
6947                          : RuntimeHelper::H_fptoui_f64_i32;
6948         } else {
6949           HelperID = isFloat32Asserting32Or64(SrcType)
6950                          ? RuntimeHelper::H_fptoui_f32_i64
6951                          : RuntimeHelper::H_fptoui_f64_i64;
6952         }
6953       } else {
6954         return;
6955       }
6956       break;
6957     case InstCast::Sitofp:
6958       if (SrcType == IceType_i64) {
6959         HelperID = isFloat32Asserting32Or64(DestTy)
6960                        ? RuntimeHelper::H_sitofp_i64_f32
6961                        : RuntimeHelper::H_sitofp_i64_f64;
6962       } else {
6963         return;
6964       }
6965       break;
6966     case InstCast::Uitofp:
6967       if (isVectorType(SrcType)) {
6968         assert(DestTy == IceType_v4f32);
6969         assert(SrcType == IceType_v4i32);
6970         HelperID = RuntimeHelper::H_uitofp_4xi32_4xf32;
6971       } else if (SrcType == IceType_i64 || SrcType == IceType_i32) {
6972         if (isInt32Asserting32Or64(SrcType)) {
6973           HelperID = isFloat32Asserting32Or64(DestTy)
6974                          ? RuntimeHelper::H_uitofp_i32_f32
6975                          : RuntimeHelper::H_uitofp_i32_f64;
6976         } else {
6977           HelperID = isFloat32Asserting32Or64(DestTy)
6978                          ? RuntimeHelper::H_uitofp_i64_f32
6979                          : RuntimeHelper::H_uitofp_i64_f64;
6980         }
6981       } else {
6982         return;
6983       }
6984       break;
6985     case InstCast::Bitcast: {
6986       if (DestTy == Src0->getType())
6987         return;
6988       switch (DestTy) {
6989       default:
6990         return;
6991       case IceType_i8:
6992         assert(Src0->getType() == IceType_v8i1);
6993         HelperID = RuntimeHelper::H_bitcast_8xi1_i8;
6994         CallDest = Func->makeVariable(IceType_i32);
6995         break;
6996       case IceType_i16:
6997         assert(Src0->getType() == IceType_v16i1);
6998         HelperID = RuntimeHelper::H_bitcast_16xi1_i16;
6999         CallDest = Func->makeVariable(IceType_i32);
7000         break;
7001       case IceType_v8i1: {
7002         assert(Src0->getType() == IceType_i8);
7003         HelperID = RuntimeHelper::H_bitcast_i8_8xi1;
7004         Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
7005         // Arguments to functions are required to be at least 32 bits wide.
7006         Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
7007         Src0 = Src0AsI32;
7008       } break;
7009       case IceType_v16i1: {
7010         assert(Src0->getType() == IceType_i16);
7011         HelperID = RuntimeHelper::H_bitcast_i16_16xi1;
7012         Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
7013         // Arguments to functions are required to be at least 32 bits wide.
7014         Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
7015         Src0 = Src0AsI32;
7016       } break;
7017       }
7018     } break;
7019     }
7020     constexpr SizeT MaxSrcs = 1;
7021     InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs);
7022     Call->addArg(Src0);
7023     StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
7024     Context.insert(Call);
7025     // The PNaCl ABI disallows i8/i16 return types, so truncate the helper
7026     // call result to the appropriate type as necessary.
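    // E.g., the v8i1 -> i8 bitcast helper actually returns an i32 (CallDest),
    // and the Trunc below narrows it back to the i8 Dest.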
7027     if (CallDest->getType() != Dest->getType())
7028       Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest);
7029     Cast->setDeleted();
7030   } else if (auto *Intrinsic = llvm::dyn_cast<InstIntrinsic>(Instr)) {
7031     CfgVector<Type> ArgTypes;
7032     Type ReturnType = IceType_void;
7033     switch (Intrinsic->getIntrinsicID()) {
7034     default:
7035       return;
7036     case Intrinsics::Ctpop: {
7037       Operand *Val = Intrinsic->getArg(0);
7038       Type ValTy = Val->getType();
7039       if (ValTy == IceType_i64)
7040         ArgTypes = {IceType_i64};
7041       else
7042         ArgTypes = {IceType_i32};
7043       ReturnType = IceType_i32;
7044     } break;
7045     case Intrinsics::Longjmp:
7046       ArgTypes = {IceType_i32, IceType_i32};
7047       ReturnType = IceType_void;
7048       break;
7049     case Intrinsics::Memcpy:
7050       ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
7051       ReturnType = IceType_void;
7052       break;
7053     case Intrinsics::Memmove:
7054       ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
7055       ReturnType = IceType_void;
7056       break;
7057     case Intrinsics::Memset:
7058       ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
7059       ReturnType = IceType_void;
7060       break;
7061     case Intrinsics::Setjmp:
7062       ArgTypes = {IceType_i32};
7063       ReturnType = IceType_i32;
7064       break;
7065     }
7066     StackArgumentsSize = getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
7067   } else if (auto *Call = llvm::dyn_cast<InstCall>(Instr)) {
7068     StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
7069   } else if (auto *Ret = llvm::dyn_cast<InstRet>(Instr)) {
7070     if (!Ret->hasRetValue())
7071       return;
7072     Operand *RetValue = Ret->getRetValue();
7073     Type ReturnType = RetValue->getType();
7074     if (!isScalarFloatingType(ReturnType))
7075       return;
7076     StackArgumentsSize = typeWidthInBytes(ReturnType);
7077   } else {
7078     return;
7079   }
7080   StackArgumentsSize = applyStackAlignment(StackArgumentsSize);
7081   updateMaxOutArgsSizeBytes(StackArgumentsSize);
7082 }
7083 
7084 uint32_t
7085 TargetX8632::getCallStackArgumentsSizeBytes(const CfgVector<Type> &ArgTypes,
7086                                             Type ReturnType) {
7087   uint32_t OutArgumentsSizeBytes = 0;
7088   uint32_t XmmArgCount = 0;
7089   uint32_t GprArgCount = 0;
7090   for (SizeT i = 0, NumArgTypes = ArgTypes.size(); i < NumArgTypes; ++i) {
7091     Type Ty = ArgTypes[i];
7092     // The PNaCl ABI requires the width of arguments to be at least 32 bits.
7093     assert(typeWidthInBytes(Ty) >= 4);
7094     if (isVectorType(Ty) &&
7095         RegX8632::getRegisterForXmmArgNum(RegX8632::getArgIndex(i, XmmArgCount))
7096             .hasValue()) {
7097       ++XmmArgCount;
7098     } else if (isScalarIntegerType(Ty) &&
7099                RegX8632::getRegisterForGprArgNum(
7100                    Ty, RegX8632::getArgIndex(i, GprArgCount))
7101                    .hasValue()) {
7102       // The 64 bit ABI allows some integers to be passed in GPRs.
7103       ++GprArgCount;
7104     } else {
7105       if (isVectorType(Ty)) {
7106         OutArgumentsSizeBytes = applyStackAlignment(OutArgumentsSizeBytes);
7107       }
7108       OutArgumentsSizeBytes += typeWidthInBytesOnStack(Ty);
7109     }
7110   }
7111   // The 32 bit ABI requires floating point values to be returned on the x87
7112   // FP stack. Ensure there is enough space for the fstp/movs for floating
7113   // returns.
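  // (Roughly: the callee leaves the result in st(0); the caller spills it
  //  with an fstp into this area and reloads it, e.g. with movss/movsd.)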
7114   if (isScalarFloatingType(ReturnType)) {
7115     OutArgumentsSizeBytes =
7116         std::max(OutArgumentsSizeBytes,
7117                  static_cast<uint32_t>(typeWidthInBytesOnStack(ReturnType)));
7118   }
7119   return OutArgumentsSizeBytes;
7120 }
7121 
7122 uint32_t TargetX8632::getCallStackArgumentsSizeBytes(const InstCall *Instr) {
7123   // Build a vector of the arguments' types.
7124   const SizeT NumArgs = Instr->getNumArgs();
7125   CfgVector<Type> ArgTypes;
7126   ArgTypes.reserve(NumArgs);
7127   for (SizeT i = 0; i < NumArgs; ++i) {
7128     Operand *Arg = Instr->getArg(i);
7129     ArgTypes.emplace_back(Arg->getType());
7130   }
7131   // Compute the return type (if any).
7132   Type ReturnType = IceType_void;
7133   Variable *Dest = Instr->getDest();
7134   if (Dest != nullptr)
7135     ReturnType = Dest->getType();
7136   return getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
7137 }
7138 
7139 Variable *TargetX8632::makeZeroedRegister(Type Ty, RegNumT RegNum) {
7140   Variable *Reg = makeReg(Ty, RegNum);
7141   switch (Ty) {
7142   case IceType_i1:
7143   case IceType_i8:
7144   case IceType_i16:
7145   case IceType_i32:
7146   case IceType_i64:
7147     // Conservatively do "mov reg, 0" to avoid modifying FLAGS.
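    // ("xor reg, reg" would be shorter but clobbers EFLAGS, which may still
    //  be live at this point.)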
7148     _mov(Reg, Ctx->getConstantZero(Ty));
7149     break;
7150   case IceType_f32:
7151   case IceType_f64:
7152     Context.insert<InstFakeDef>(Reg);
7153     _xorps(Reg, Reg);
7154     break;
7155   default:
7156     // All vector types use the same pxor instruction.
7157     assert(isVectorType(Ty));
7158     Context.insert<InstFakeDef>(Reg);
7159     _pxor(Reg, Reg);
7160     break;
7161   }
7162   return Reg;
7163 }
7164 
7165 // There is no support for loading or emitting vector constants, so the vector
7166 // values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are
7167 // initialized with register operations.
7168 //
7169 // TODO(wala): Add limited support for vector constants so that complex
7170 // initialization in registers is unnecessary.
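//
// For example, a vector of ones (e.g. v4i32 <1, 1, 1, 1>) is currently built
// roughly as:
//   pxor    xmm, xmm     ; all zeros
//   pcmpeqd tmp, tmp     ; all ones (-1 per lane)
//   psubd   xmm, tmp     ; 0 - (-1) == 1 per lane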
7171 
7172 Variable *TargetX8632::makeVectorOfZeros(Type Ty, RegNumT RegNum) {
7173   return makeZeroedRegister(Ty, RegNum);
7174 }
7175 
7176 Variable *TargetX8632::makeVectorOfMinusOnes(Type Ty, RegNumT RegNum) {
7177   Variable *MinusOnes = makeReg(Ty, RegNum);
7178   // Insert a FakeDef so the live range of MinusOnes is not overestimated.
7179   Context.insert<InstFakeDef>(MinusOnes);
7180   if (Ty == IceType_f64)
7181     // Making a vector of minus ones of type f64 is currently only used for
7182     // the fabs intrinsic.  To use the f64 type to create this mask with
7183     // pcmpeqq requires SSE 4.1.  Since we're just creating a mask, pcmpeqd
7184     // does the same job and only requires SSE2.
7185     _pcmpeq(MinusOnes, MinusOnes, IceType_f32);
7186   else
7187     _pcmpeq(MinusOnes, MinusOnes);
7188   return MinusOnes;
7189 }
7190 
7191 Variable *TargetX8632::makeVectorOfOnes(Type Ty, RegNumT RegNum) {
7192   Variable *Dest = makeVectorOfZeros(Ty, RegNum);
7193   Variable *MinusOne = makeVectorOfMinusOnes(Ty);
7194   _psub(Dest, MinusOne);
7195   return Dest;
7196 }
7197 
7198 Variable *TargetX8632::makeVectorOfHighOrderBits(Type Ty, RegNumT RegNum) {
7199   assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
7200          Ty == IceType_v16i8);
7201   if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
7202     Variable *Reg = makeVectorOfOnes(Ty, RegNum);
7203     SizeT Shift = typeWidthInBytes(typeElementType(Ty)) * X86_CHAR_BIT - 1;
7204     _psll(Reg, Ctx->getConstantInt8(Shift));
7205     return Reg;
7206   } else {
7207     // SSE has no left shift operation for vectors of 8 bit integers.
7208     constexpr uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
7209     Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
7210     Variable *Reg = makeReg(Ty, RegNum);
7211     _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
7212     _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
7213     return Reg;
7214   }
7215 }
7216 
7217 /// Construct a mask in a register that can be and'ed with a floating-point
7218 /// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32
7219 /// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as vector of
7220 /// ones logically right shifted one bit.
7221 // TODO(stichnot): Fix the wala TODO above, to represent vector constants in
7222 // memory.
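//
// E.g., for v4f32 this emits roughly "pcmpeqd reg, reg ; psrld $1, reg",
// producing <4 x 0x7fffffff>.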
7223 
7224 Variable *TargetX8632::makeVectorOfFabsMask(Type Ty, RegNumT RegNum) {
7225   Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
7226   _psrl(Reg, Ctx->getConstantInt8(1));
7227   return Reg;
7228 }
7229 
7230 X86OperandMem *TargetX8632::getMemoryOperandForStackSlot(Type Ty,
7231                                                          Variable *Slot,
7232                                                          uint32_t Offset) {
7233   // Ensure that Slot is a stack slot.
7234   assert(Slot->mustNotHaveReg());
7235   assert(Slot->getRegNum().hasNoValue());
7236   // Compute the location of Slot in memory.
7237   // TODO(wala,stichnot): lea should not be required. The address of the
7238   // stack slot is known at compile time (although not until after
7239   // addProlog()).
7240   const Type PointerType = IceType_i32;
7241   Variable *Loc = makeReg(PointerType);
7242   _lea(Loc, Slot);
7243   Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
7244   return X86OperandMem::create(Func, Ty, Loc, ConstantOffset);
7245 }
7246 
7247 /// Lowering helper to copy a scalar integer source operand into some 8-bit
7248 /// GPR. Src is assumed to already be legalized.  If the source operand is
7249 /// known to be a memory or immediate operand, a simple mov will suffice.  But
7250 /// if the source operand can be a physical register, then it must first be
7251 /// copied into a physical register that is truncable to 8-bit, then truncated
7252 /// into a physical register that can receive a truncation, and finally copied
7253 /// into the result 8-bit register (which in general can be any 8-bit
7254 /// register).  For example, moving %ebp into %ah may be accomplished as:
7255 ///   movl %ebp, %edx
7256 ///   mov_trunc %edx, %dl  // this redundant assignment is ultimately elided
7257 ///   movb %dl, %ah
7258 /// On the other hand, moving a memory or immediate operand into ah:
7259 ///   movb 4(%ebp), %ah
7260 ///   movb $my_imm, %ah
7261 ///
7262 /// Note #1.  On a 64-bit target, the "movb 4(%ebp), %ah" is likely not
7263 /// encodable, so RegNum=Reg_ah should NOT be given as an argument.  Instead,
7264 /// use RegNum=RegNumT() and then let the caller do a separate copy into
7265 /// Reg_ah.
7266 ///
7267 /// Note #2.  ConstantRelocatable operands are also put through this process
7268 /// (not truncated directly) because our ELF emitter does R_386_32 relocations
7269 /// but not R_386_8 relocations.
7270 ///
7271 /// Note #3.  If Src is a Variable, the result will be an infinite-weight i8
7272 /// Variable with the RCX86_IsTrunc8Rcvr register class.  As such, this helper
7273 /// is a convenient way to prevent ah/bh/ch/dh from being an (invalid)
7274 /// argument to the pinsrb instruction.
7275 
7276 Variable *TargetX8632::copyToReg8(Operand *Src, RegNumT RegNum) {
7277   Type Ty = Src->getType();
7278   assert(isScalarIntegerType(Ty));
7279   assert(Ty != IceType_i1);
7280   Variable *Reg = makeReg(IceType_i8, RegNum);
7281   Reg->setRegClass(RCX86_IsTrunc8Rcvr);
7282   if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) {
7283     Variable *SrcTruncable = makeReg(Ty);
7284     switch (Ty) {
7285     case IceType_i64:
7286       SrcTruncable->setRegClass(RCX86_Is64To8);
7287       break;
7288     case IceType_i32:
7289       SrcTruncable->setRegClass(RCX86_Is32To8);
7290       break;
7291     case IceType_i16:
7292       SrcTruncable->setRegClass(RCX86_Is16To8);
7293       break;
7294     default:
7295       // i8 - just use default register class
7296       break;
7297     }
7298     Variable *SrcRcvr = makeReg(IceType_i8);
7299     SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr);
7300     _mov(SrcTruncable, Src);
7301     _mov(SrcRcvr, SrcTruncable);
7302     Src = SrcRcvr;
7303   }
7304   _mov(Reg, Src);
7305   return Reg;
7306 }
7307 
7308 /// Helper for legalize() to emit the right code to lower an operand to a
7309 /// register of the appropriate type.
7310 
7311 Variable *TargetX8632::copyToReg(Operand *Src, RegNumT RegNum) {
7312   Type Ty = Src->getType();
7313   Variable *Reg = makeReg(Ty, RegNum);
7314   if (isVectorType(Ty)) {
7315     _movp(Reg, Src);
7316   } else {
7317     _mov(Reg, Src);
7318   }
7319   return Reg;
7320 }
7321 
7322 Operand *TargetX8632::legalize(Operand *From, LegalMask Allowed,
7323                                RegNumT RegNum) {
7324   const Type Ty = From->getType();
7325   // Assert that a physical register is allowed. To date, all calls to
7326   // legalize() allow a physical register. If a physical register needs to be
7327   // explicitly disallowed, then new code will need to be written to force a
7328   // spill.
7329   assert(Allowed & Legal_Reg);
7330   // If we're asking for a specific physical register, make sure we're not
7331   // allowing any other operand kinds. (This could be future work, e.g. allow
7332   // the shl shift amount to be either an immediate or in ecx.)
7333   assert(RegNum.hasNoValue() || Allowed == Legal_Reg);
7334 
7335   // Substitute with an available infinite-weight variable if possible.  Only
7336   // do this when we are not asking for a specific register, and when the
7337   // substitution is not locked to a specific register, and when the types
7338   // match, in order to capture the vast majority of opportunities and avoid
7339   // corner cases in the lowering.
7340   if (RegNum.hasNoValue()) {
7341     if (Variable *Subst = getContext().availabilityGet(From)) {
7342       // At this point we know there is a potential substitution available.
7343       if (Subst->mustHaveReg() && !Subst->hasReg()) {
7344         // At this point we know the substitution will have a register.
7345         if (From->getType() == Subst->getType()) {
7346           // At this point we know the substitution's register is compatible.
7347           return Subst;
7348         }
7349       }
7350     }
7351   }
7352 
7353   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(From)) {
7354     // Before doing anything with a Mem operand, we need to ensure that the
7355     // Base and Index components are in physical registers.
7356     Variable *Base = Mem->getBase();
7357     Variable *Index = Mem->getIndex();
7358     Constant *Offset = Mem->getOffset();
7359     Variable *RegBase = nullptr;
7360     Variable *RegIndex = nullptr;
7361     uint16_t Shift = Mem->getShift();
7362     if (Base) {
7363       RegBase = llvm::cast<Variable>(
7364           legalize(Base, Legal_Reg | Legal_Rematerializable));
7365     }
7366     if (Index) {
7367       // TODO(jpp): perhaps we should only allow Legal_Reg if
7368       // Base->isRematerializable.
7369       RegIndex = llvm::cast<Variable>(
7370           legalize(Index, Legal_Reg | Legal_Rematerializable));
7371     }
7372 
7373     if (Base != RegBase || Index != RegIndex) {
7374       Mem = X86OperandMem::create(Func, Ty, RegBase, Offset, RegIndex, Shift,
7375                                   Mem->getSegmentRegister());
7376     }
7377 
7378     From = Mem;
7379 
7380     if (!(Allowed & Legal_Mem)) {
7381       From = copyToReg(From, RegNum);
7382     }
7383     return From;
7384   }
7385 
7386   if (auto *Const = llvm::dyn_cast<Constant>(From)) {
7387     if (llvm::isa<ConstantUndef>(Const)) {
7388       From = legalizeUndef(Const, RegNum);
7389       if (isVectorType(Ty))
7390         return From;
7391       Const = llvm::cast<Constant>(From);
7392     }
7393     // There should be no constants of vector type (other than undef).
7394     assert(!isVectorType(Ty));
7395 
7396     if (!llvm::isa<ConstantRelocatable>(Const)) {
7397       if (isScalarFloatingType(Ty)) {
7398         // Convert a scalar floating point constant into an explicit memory
7399         // operand.
7400         if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(Const)) {
7401           if (Utils::isPositiveZero(ConstFloat->getValue()))
7402             return makeZeroedRegister(Ty, RegNum);
7403         } else if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(Const)) {
7404           if (Utils::isPositiveZero(ConstDouble->getValue()))
7405             return makeZeroedRegister(Ty, RegNum);
7406         }
7407 
7408         auto *CFrom = llvm::cast<Constant>(From);
7409         assert(CFrom->getShouldBePooled());
7410         Constant *Offset = Ctx->getConstantSym(0, CFrom->getLabelName());
7411         auto *Mem = X86OperandMem::create(Func, Ty, nullptr, Offset);
7412         From = Mem;
7413       }
7414     }
7415 
7416     bool NeedsReg = false;
7417     if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
7418       // Immediate specifically not allowed.
7419       NeedsReg = true;
7420     if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
7421       // On x86, FP constants are lowered to mem operands.
7422       NeedsReg = true;
7423     if (NeedsReg) {
7424       From = copyToReg(From, RegNum);
7425     }
7426     return From;
7427   }
7428 
7429   if (auto *Var = llvm::dyn_cast<Variable>(From)) {
7430     // Check if the variable is guaranteed a physical register. This can
7431     // happen either when the variable is pre-colored or when it is assigned
7432     // infinite weight.
7433     bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
7434     bool MustRematerialize =
7435         (Var->isRematerializable() && !(Allowed & Legal_Rematerializable));
7436     // We need a new physical register for the operand if:
7437     // - Mem is not allowed and Var isn't guaranteed a physical register, or
7438     // - RegNum is required and Var->getRegNum() doesn't match, or
7439     // - Var is a rematerializable variable and rematerializable
7440     //   pass-through is not allowed (in which case we need a lea
7441     //   instruction).
7442     if (MustRematerialize) {
7443       Variable *NewVar = makeReg(Ty, RegNum);
7444       // Since Var is rematerializable, the offset will be added when the lea
7445       // is emitted.
7446       constexpr Constant *NoOffset = nullptr;
7447       auto *Mem = X86OperandMem::create(Func, Ty, Var, NoOffset);
7448       _lea(NewVar, Mem);
7449       From = NewVar;
7450     } else if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
7451                (RegNum.hasValue() && RegNum != Var->getRegNum())) {
7452       From = copyToReg(From, RegNum);
7453     }
7454     return From;
7455   }
7456 
7457   llvm::report_fatal_error("Unhandled operand kind in legalize()");
7458   return From;
7459 }
7460 
7461 /// Provide a trivial wrapper to legalize() for this common usage.
7462 
7463 Variable *TargetX8632::legalizeToReg(Operand *From, RegNumT RegNum) {
7464   return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
7465 }
7466 
7467 /// Legalize undef values to concrete values.
7468 
7469 Operand *TargetX8632::legalizeUndef(Operand *From, RegNumT RegNum) {
7470   Type Ty = From->getType();
7471   if (llvm::isa<ConstantUndef>(From)) {
7472     // Lower undefs to zero.  Another option is to lower undefs to an
7473     // uninitialized register; however, using an uninitialized register
7474     // results in less predictable code.
7475     //
7476     // If in the future the implementation is changed to lower undef values to
7477     // uninitialized registers, a FakeDef will be needed:
7478     //     Context.insert<InstFakeDef>(Reg);
7479     // This is in order to ensure that the live range of Reg is not
7480     // overestimated.  If the constant being lowered is a 64 bit value, then
7481     // the result should be split and the lo and hi components will need to go
7482     // in uninitialized registers.
7483     if (isVectorType(Ty))
7484       return makeVectorOfZeros(Ty, RegNum);
7485     return Ctx->getConstantZero(Ty);
7486   }
7487   return From;
7488 }
7489 
7490 /// For the cmp instruction, if Src1 is an immediate, or known to be a
7491 /// physical register, we can allow Src0 to be a memory operand. Otherwise,
7492 /// Src0 must be copied into a physical register. (Actually, either Src0 or
7493 /// Src1 can be chosen for the physical register, but unfortunately we have to
7494 /// commit to one or the other before register allocation.)
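/// E.g., "cmp [mem], $imm" and "cmp [mem], reg" are encodable, but
/// "cmp [mem], [mem]" is not.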
7495 
7496 Operand *TargetX8632::legalizeSrc0ForCmp(Operand *Src0, Operand *Src1) {
7497   bool IsSrc1ImmOrReg = false;
7498   if (llvm::isa<Constant>(Src1)) {
7499     IsSrc1ImmOrReg = true;
7500   } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) {
7501     if (Var->hasReg())
7502       IsSrc1ImmOrReg = true;
7503   }
7504   return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
7505 }
7506 
7507 X86OperandMem *TargetX8632::formMemoryOperand(Operand *Opnd, Type Ty,
7508                                               bool DoLegalize) {
7509   auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd);
7510   // It may be the case that address mode optimization already creates an
7511   // X86OperandMem, so in that case it wouldn't need another level of
7512   // transformation.
7513   if (!Mem) {
7514     auto *Base = llvm::dyn_cast<Variable>(Opnd);
7515     auto *Offset = llvm::dyn_cast<Constant>(Opnd);
7516     assert(Base || Offset);
7517     if (Offset) {
7518       if (!llvm::isa<ConstantRelocatable>(Offset)) {
7519         if (llvm::isa<ConstantInteger64>(Offset)) {
7520           // Memory operands cannot have 64-bit immediates, so they must be
7521           // legalized into a register only.
7522           Base = llvm::cast<Variable>(legalize(Offset, Legal_Reg));
7523           Offset = nullptr;
7524         } else {
7525           Offset = llvm::cast<Constant>(legalize(Offset));
7526 
7527           assert(llvm::isa<ConstantInteger32>(Offset) ||
7528                  llvm::isa<ConstantRelocatable>(Offset));
7529         }
7530       }
7531     }
7532     Mem = X86OperandMem::create(Func, Ty, Base, Offset);
7533   }
7534   return llvm::cast<X86OperandMem>(DoLegalize ? legalize(Mem) : Mem);
7535 }
7536 
7537 Variable *TargetX8632::makeReg(Type Type, RegNumT RegNum) {
7538   // There aren't any 64-bit integer registers for x86-32.
7539   assert(Type != IceType_i64);
7540   Variable *Reg = Func->makeVariable(Type);
7541   if (RegNum.hasValue())
7542     Reg->setRegNum(RegNum);
7543   else
7544     Reg->setMustHaveReg();
7545   return Reg;
7546 }
7547 
7548 const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32, IceType_f64,
7549                             IceType_v16i8};
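// E.g., with no MaxSize limit, largestTypeInSize(11) yields IceType_f64 (the
// largest entry of at most 11 bytes), while firstTypeThatFitsSize(11) yields
// IceType_v16i8 (the smallest entry of at least 11 bytes).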
7550 
7551 Type TargetX8632::largestTypeInSize(uint32_t Size, uint32_t MaxSize) {
7552   assert(Size != 0);
7553   uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
7554   uint32_t MaxIndex = MaxSize == NoSizeLimit
7555                           ? llvm::array_lengthof(TypeForSize) - 1
7556                           : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
7557   return TypeForSize[std::min(TyIndex, MaxIndex)];
7558 }
7559 
7560 Type TargetX8632::firstTypeThatFitsSize(uint32_t Size, uint32_t MaxSize) {
7561   assert(Size != 0);
7562   uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
7563   if (!llvm::isPowerOf2_32(Size))
7564     ++TyIndex;
7565   uint32_t MaxIndex = MaxSize == NoSizeLimit
7566                           ? llvm::array_lengthof(TypeForSize) - 1
7567                           : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
7568   return TypeForSize[std::min(TyIndex, MaxIndex)];
7569 }
7570 
7571 void TargetX8632::postLower() {
7572   if (Func->getOptLevel() == Opt_m1)
7573     return;
7574   markRedefinitions();
7575   Context.availabilityUpdate();
7576 }
7577 
7578 void TargetX8632::emit(const ConstantInteger32 *C) const {
7579   if (!BuildDefs::dump())
7580     return;
7581   Ostream &Str = Ctx->getStrEmit();
7582   Str << "$" << C->getValue();
7583 }
7584 
7585 void TargetX8632::emit(const ConstantInteger64 *C) const {
7586   llvm::report_fatal_error("Not expecting to emit 64-bit integers");
7587 }
7588 
7589 void TargetX8632::emit(const ConstantFloat *C) const {
7590   if (!BuildDefs::dump())
7591     return;
7592   Ostream &Str = Ctx->getStrEmit();
7593   Str << C->getLabelName();
7594 }
7595 
7596 void TargetX8632::emit(const ConstantDouble *C) const {
7597   if (!BuildDefs::dump())
7598     return;
7599   Ostream &Str = Ctx->getStrEmit();
7600   Str << C->getLabelName();
7601 }
7602 
7603 void TargetX8632::emit(const ConstantUndef *) const {
7604   llvm::report_fatal_error("undef value encountered by emitter.");
7605 }
7606 
7607 void TargetX8632::emit(const ConstantRelocatable *C) const {
7608   if (!BuildDefs::dump())
7609     return;
7610   Ostream &Str = Ctx->getStrEmit();
7611   Str << "$";
7612   emitWithoutPrefix(C);
7613 }
7614 
7615 void TargetX8632::emitJumpTable(const Cfg *,
7616                                 const InstJumpTable *JumpTable) const {
7617   if (!BuildDefs::dump())
7618     return;
7619   Ostream &Str = Ctx->getStrEmit();
7620   Str << "\t.section\t.rodata." << JumpTable->getSectionName()
7621       << ",\"a\",@progbits\n"
7622          "\t.align\t"
7623       << typeWidthInBytes(IceType_i32) << "\n"
7624       << JumpTable->getName() << ":";
7625 
7626   for (SizeT I = 0; I < JumpTable->getNumTargets(); ++I)
7627     Str << "\n\t.long\t" << JumpTable->getTarget(I)->getAsmName();
7628   Str << "\n";
7629 }
7630 
7631 const TargetX8632::TableFcmpType TargetX8632::TableFcmp[] = {
7632 #define X(val, dflt, swapS, C1, C2, swapV, pred)                               \
7633   {dflt, swapS, CondX86::C1, CondX86::C2, swapV, CondX86::pred},
7634     FCMPX8632_TABLE
7635 #undef X
7636 };
7637 
7638 const size_t TargetX8632::TableFcmpSize = llvm::array_lengthof(TableFcmp);
7639 
7640 const TargetX8632::TableIcmp32Type TargetX8632::TableIcmp32[] = {
7641 #define X(val, C_32, C1_64, C2_64, C3_64) {CondX86::C_32},
7642     ICMPX8632_TABLE
7643 #undef X
7644 };
7645 
7646 const size_t TargetX8632::TableIcmp32Size = llvm::array_lengthof(TableIcmp32);
7647 
7648 const TargetX8632::TableIcmp64Type TargetX8632::TableIcmp64[] = {
7649 #define X(val, C_32, C1_64, C2_64, C3_64)                                      \
7650   {CondX86::C1_64, CondX86::C2_64, CondX86::C3_64},
7651     ICMPX8632_TABLE
7652 #undef X
7653 };
7654 
7655 const size_t TargetX8632::TableIcmp64Size = llvm::array_lengthof(TableIcmp64);
7656 
7657 std::array<SmallBitVector, RCX86_NUM> TargetX8632::TypeToRegisterSet = {{}};
7658 
7659 std::array<SmallBitVector, RCX86_NUM> TargetX8632::TypeToRegisterSetUnfiltered =
7660     {{}};
7661 
7662 std::array<SmallBitVector, RegisterSet::Reg_NUM> TargetX8632::RegisterAliases =
7663     {{}};
7664 
7665 template <typename T>
7666 void TargetDataX8632::emitConstantPool(GlobalContext *Ctx) {
7667   if (!BuildDefs::dump())
7668     return;
7669   Ostream &Str = Ctx->getStrEmit();
7670   Type Ty = T::Ty;
7671   SizeT Align = typeAlignInBytes(Ty);
7672   ConstantList Pool = Ctx->getConstantPool(Ty);
7673 
7674   Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
7675       << "\n";
7676   Str << "\t.align\t" << Align << "\n";
7677 
7678   for (Constant *C : Pool) {
7679     if (!C->getShouldBePooled())
7680       continue;
7681     auto *Const = llvm::cast<typename T::IceType>(C);
7682     typename T::IceType::PrimType Value = Const->getValue();
7683     // Use memcpy() to copy bits from Value into RawValue in a way that avoids
7684     // breaking strict-aliasing rules.
7685     typename T::PrimitiveIntType RawValue;
7686     memcpy(&RawValue, &Value, sizeof(Value));
7687     char buf[30];
7688     int CharsPrinted =
7689         snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
7690     assert(CharsPrinted >= 0);
7691     assert((size_t)CharsPrinted < llvm::array_lengthof(buf));
7692     (void)CharsPrinted; // avoid warnings if asserts are disabled
7693     Str << Const->getLabelName();
7694     Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t/* " << T::TypeName << " "
7695         << Value << " */\n";
7696   }
7697 }
7698 
7699 void TargetDataX8632::lowerConstants() {
7700   if (getFlags().getDisableTranslation())
7701     return;
7702   switch (getFlags().getOutFileType()) {
7703   case FT_Elf: {
7704     ELFObjectWriter *Writer = Ctx->getObjectWriter();
7705 
7706     Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
7707     Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
7708     Writer->writeConstantPool<ConstantInteger32>(IceType_i32);
7709 
7710     Writer->writeConstantPool<ConstantFloat>(IceType_f32);
7711     Writer->writeConstantPool<ConstantDouble>(IceType_f64);
7712   } break;
7713   case FT_Asm:
7714   case FT_Iasm: {
7715     OstreamLocker L(Ctx);
7716 
7717     emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
7718     emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
7719     emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);
7720 
7721     emitConstantPool<PoolTypeConverter<float>>(Ctx);
7722     emitConstantPool<PoolTypeConverter<double>>(Ctx);
7723   } break;
7724   }
7725 }
7726 
7727 void TargetDataX8632::lowerJumpTables() {
7728   const bool IsPIC = false;
7729   switch (getFlags().getOutFileType()) {
7730   case FT_Elf: {
7731     ELFObjectWriter *Writer = Ctx->getObjectWriter();
7732     const FixupKind RelocationKind = FK_Abs;
7733     for (const JumpTableData &JT : Ctx->getJumpTables())
7734       Writer->writeJumpTable(JT, RelocationKind, IsPIC);
7735   } break;
7736   case FT_Asm:
7737     // Already emitted from Cfg
7738     break;
7739   case FT_Iasm: {
7740     if (!BuildDefs::dump())
7741       return;
7742     Ostream &Str = Ctx->getStrEmit();
7743     const char *Prefix = IsPIC ? ".data.rel.ro." : ".rodata.";
7744     for (const JumpTableData &JT : Ctx->getJumpTables()) {
7745       Str << "\t.section\t" << Prefix << JT.getSectionName()
7746           << ",\"a\",@progbits\n"
7747              "\t.align\t"
7748           << typeWidthInBytes(IceType_i32) << "\n"
7749           << JT.getName().toString() << ":";
7750 
7751       // Pointers are 32 bits wide on x86-32, hence the use of .long.
7752       for (intptr_t TargetOffset : JT.getTargetOffsets())
7753         Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset;
7754       Str << "\n";
7755     }
7756   } break;
7757   }
7758 }
7759 
7760 void TargetDataX8632::lowerGlobals(const VariableDeclarationList &Vars,
7761                                    const std::string &SectionSuffix) {
7762   const bool IsPIC = false;
7763   switch (getFlags().getOutFileType()) {
7764   case FT_Elf: {
7765     ELFObjectWriter *Writer = Ctx->getObjectWriter();
7766     Writer->writeDataSection(Vars, FK_Abs, SectionSuffix, IsPIC);
7767   } break;
7768   case FT_Asm:
7769   case FT_Iasm: {
7770     OstreamLocker L(Ctx);
7771     for (const VariableDeclaration *Var : Vars) {
7772       if (getFlags().matchTranslateOnly(Var->getName(), 0)) {
7773         emitGlobal(*Var, SectionSuffix);
7774       }
7775     }
7776   } break;
7777   }
7778 }
7779 
7780 //------------------------------------------------------------------------------
7781 //     __      ______  __     __  ______  ______  __  __   __  ______
7782 //    /\ \    /\  __ \/\ \  _ \ \/\  ___\/\  == \/\ \/\ "-.\ \/\  ___\
7783 //    \ \ \___\ \ \/\ \ \ \/ ".\ \ \  __\\ \  __<\ \ \ \ \-.  \ \ \__ \
7784 //     \ \_____\ \_____\ \__/".~\_\ \_____\ \_\ \_\ \_\ \_\\"\_\ \_____\
7785 //      \/_____/\/_____/\/_/   \/_/\/_____/\/_/ /_/\/_/\/_/ \/_/\/_____/
7786 //
7787 //------------------------------------------------------------------------------
7788 void TargetX8632::_add_sp(Operand *Adjustment) {
7789   Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
7790   _add(esp, Adjustment);
7791 }
7792 
7793 void TargetX8632::_mov_sp(Operand *NewValue) {
7794   Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
7795   _redefined(_mov(esp, NewValue));
7796 }
7797 
7798 void TargetX8632::_sub_sp(Operand *Adjustment) {
7799   Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
7800   _sub(esp, Adjustment);
7801   // Add a fake use of the stack pointer, to prevent the stack pointer
7802   // adjustment from being dead-code eliminated in a function that doesn't
7803   // return.
7804   Context.insert<InstFakeUse>(esp);
7805 }
7806 
7807 void TargetX8632::_link_bp() {
7808   Variable *ebp = getPhysicalRegister(RegX8632::Reg_ebp);
7809   Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
7810   _push(ebp);
7811   _mov(ebp, esp);
7812   // Keep ebp live for late-stage liveness analysis (e.g. asm-verbose mode).
7813   Context.insert<InstFakeUse>(ebp);
7814 }
7815 
7816 void TargetX8632::_unlink_bp() {
7817   Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
7818   Variable *ebp = getPhysicalRegister(RegX8632::Reg_ebp);
7819   // For late-stage liveness analysis (e.g. asm-verbose mode), adding a fake
7820   // use of esp before the assignment of esp=ebp keeps previous esp
7821   // adjustments from being dead-code eliminated.
7822   Context.insert<InstFakeUse>(esp);
7823   _mov(esp, ebp);
7824   _pop(ebp);
7825 }
7826 
7827 void TargetX8632::_push_reg(RegNumT RegNum) {
7828   _push(getPhysicalRegister(RegNum, WordType));
7829 }
7830 
7831 void TargetX8632::_pop_reg(RegNumT RegNum) {
7832   _pop(getPhysicalRegister(RegNum, WordType));
7833 }
7834 
7835 /// Lower an indirect jump, adding sandboxing when needed.
7836 void TargetX8632::lowerIndirectJump(Variable *JumpTarget) { _jmp(JumpTarget); }
7837 
7838 Inst *TargetX8632::emitCallToTarget(Operand *CallTarget, Variable *ReturnReg,
7839                                     size_t NumVariadicFpArgs) {
7840   (void)NumVariadicFpArgs;
7841   // Note that NumVariadicFpArgs is only used for System V x86-64 variadic
7842   // calls, because floating point arguments are passed via vector registers,
7843   // whereas for x86-32, all args are passed via the stack.
7844 
7845   return Context.insert<Insts::Call>(ReturnReg, CallTarget);
7846 }
7847 
7848 Variable *TargetX8632::moveReturnValueToRegister(Operand *Value,
7849                                                  Type ReturnType) {
7850   if (isVectorType(ReturnType)) {
7851     return legalizeToReg(Value, RegX8632::Reg_xmm0);
7852   } else if (isScalarFloatingType(ReturnType)) {
7853     _fld(Value);
7854     return nullptr;
7855   } else {
7856     assert(ReturnType == IceType_i32 || ReturnType == IceType_i64);
7857     if (ReturnType == IceType_i64) {
7858       Variable *eax = legalizeToReg(loOperand(Value), RegX8632::Reg_eax);
7859       Variable *edx = legalizeToReg(hiOperand(Value), RegX8632::Reg_edx);
7860       Context.insert<InstFakeUse>(edx);
7861       return eax;
7862     } else {
7863       Variable *Reg = nullptr;
7864       _mov(Reg, Value, RegX8632::Reg_eax);
7865       return Reg;
7866     }
7867   }
7868 }
7869 
7870 void TargetX8632::emitStackProbe(size_t StackSizeBytes) {
7871 #if defined(_WIN32)
7872   if (StackSizeBytes >= 4096) {
7873     // _chkstk on Win32 is actually __alloca_probe, which adjusts ESP by the
7874     // stack amount specified in EAX, so we save ESP in ECX, and restore them
7875     // both after the call.
7876 
7877     Variable *EAX = makeReg(IceType_i32, RegX8632::Reg_eax);
7878     Variable *ESP = makeReg(IceType_i32, RegX8632::Reg_esp);
7879     Variable *ECX = makeReg(IceType_i32, RegX8632::Reg_ecx);
7880 
7881     _push_reg(ECX->getRegNum());
7882     _mov(ECX, ESP);
7883 
7884     _mov(EAX, Ctx->getConstantInt32(StackSizeBytes));
7885 
7886     auto *CallTarget =
7887         Ctx->getConstantInt32(reinterpret_cast<int32_t>(&_chkstk));
7888     emitCallToTarget(CallTarget, nullptr);
7889 
7890     _mov(ESP, ECX);
7891     _pop_reg(ECX->getRegNum());
7892   }
7893 #endif
7894 }
7895 
7896 // In some cases, there are x-macros tables for both high-level and low-level
7897 // instructions/operands that use the same enum key value. The tables are kept
7898 // separate to maintain a proper separation between abstraction layers. There
7899 // is a risk that the tables could get out of sync if enum values are
7900 // reordered or if entries are added or deleted. The following dummy
7901 // namespaces use static_asserts to ensure everything is kept in sync.
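//
// For example, if the "Olt" rows of ICEINSTFCMP_TABLE and FCMPX8632_TABLE were
// ever reordered relative to each other, the corresponding static_assert in
// dummy1 below would fire at compile time.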
7902 
7903 namespace {
7904 // Validate the enum values in FCMPX8632_TABLE.
7905 namespace dummy1 {
7906 // Define a temporary set of enum values based on low-level table entries.
7907 enum _tmp_enum {
7908 #define X(val, dflt, swapS, C1, C2, swapV, pred) _tmp_##val,
7909   FCMPX8632_TABLE
7910 #undef X
7911       _num
7912 };
7913 // Define a set of constants based on high-level table entries.
7914 #define X(tag, str) static const int _table1_##tag = InstFcmp::tag;
7915 ICEINSTFCMP_TABLE
7916 #undef X
7917 // Define a set of constants based on low-level table entries, and ensure the
7918 // table entry keys are consistent.
7919 #define X(val, dflt, swapS, C1, C2, swapV, pred)                               \
7920   static const int _table2_##val = _tmp_##val;                                 \
7921   static_assert(                                                               \
7922       _table1_##val == _table2_##val,                                          \
7923       "Inconsistency between FCMPX8632_TABLE and ICEINSTFCMP_TABLE");
7924 FCMPX8632_TABLE
7925 #undef X
7926 // Repeat the static asserts with respect to the high-level table entries in
7927 // case the high-level table has extra entries.
7928 #define X(tag, str)                                                            \
7929   static_assert(                                                               \
7930       _table1_##tag == _table2_##tag,                                          \
7931       "Inconsistency between FCMPX8632_TABLE and ICEINSTFCMP_TABLE");
7932 ICEINSTFCMP_TABLE
7933 #undef X
7934 } // end of namespace dummy1
7935 
7936 // Validate the enum values in ICMPX8632_TABLE.
7937 namespace dummy2 {
7938 // Define a temporary set of enum values based on low-level table entries.
7939 enum _tmp_enum {
7940 #define X(val, C_32, C1_64, C2_64, C3_64) _tmp_##val,
7941   ICMPX8632_TABLE
7942 #undef X
7943       _num
7944 };
7945 // Define a set of constants based on high-level table entries.
7946 #define X(tag, reverse, str) static const int _table1_##tag = InstIcmp::tag;
7947 ICEINSTICMP_TABLE
7948 #undef X
7949 // Define a set of constants based on low-level table entries, and ensure the
7950 // table entry keys are consistent.
7951 #define X(val, C_32, C1_64, C2_64, C3_64)                                      \
7952   static const int _table2_##val = _tmp_##val;                                 \
7953   static_assert(                                                               \
7954       _table1_##val == _table2_##val,                                          \
7955       "Inconsistency between ICMPX8632_TABLE and ICEINSTICMP_TABLE");
7956 ICMPX8632_TABLE
7957 #undef X
7958 // Repeat the static asserts with respect to the high-level table entries in
7959 // case the high-level table has extra entries.
7960 #define X(tag, reverse, str)                                                   \
7961   static_assert(                                                               \
7962       _table1_##tag == _table2_##tag,                                          \
7963       "Inconsistency between ICMPX8632_TABLE and ICEINSTICMP_TABLE");
7964 ICEINSTICMP_TABLE
7965 #undef X
7966 } // end of namespace dummy2
7967 
7968 // Validate the enum values in ICETYPEX86_TABLE.
7969 namespace dummy3 {
7970 // Define a temporary set of enum values based on low-level table entries.
7971 enum _tmp_enum {
7972 #define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld)    \
7973   _tmp_##tag,
7974   ICETYPEX86_TABLE
7975 #undef X
7976       _num
7977 };
7978 // Define a set of constants based on high-level table entries.
7979 #define X(tag, sizeLog2, align, elts, elty, str, rcstr)                        \
7980   static const int _table1_##tag = IceType_##tag;
7981 ICETYPE_TABLE
7982 #undef X
7983 // Define a set of constants based on low-level table entries, and ensure the
7984 // table entry keys are consistent.
7985 #define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld)    \
7986   static const int _table2_##tag = _tmp_##tag;                                 \
7987   static_assert(_table1_##tag == _table2_##tag,                                \
7988                 "Inconsistency between ICETYPEX86_TABLE and ICETYPE_TABLE");
7989 ICETYPEX86_TABLE
7990 #undef X
7991 // Repeat the static asserts with respect to the high-level table entries in
7992 // case the high-level table has extra entries.
7993 #define X(tag, sizeLog2, align, elts, elty, str, rcstr)                        \
7994   static_assert(_table1_##tag == _table2_##tag,                                \
7995                 "Inconsistency between ICETYPEX86_TABLE and ICETYPE_TABLE");
7996 ICETYPE_TABLE
7997 #undef X
7998 
7999 } // end of namespace dummy3
8000 } // end of anonymous namespace
8001 
8002 } // end of namespace X8632
8003 } // end of namespace Ice
8004