xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the X86-specific support for the FastISel class. Much
10 // of the target-specific code is generated by tablegen in the file
11 // X86GenFastISel.inc, which is #included here.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "X86.h"
16 #include "X86CallingConv.h"
17 #include "X86InstrBuilder.h"
18 #include "X86InstrInfo.h"
19 #include "X86MachineFunctionInfo.h"
20 #include "X86RegisterInfo.h"
21 #include "X86Subtarget.h"
22 #include "X86TargetMachine.h"
23 #include "llvm/Analysis/BranchProbabilityInfo.h"
24 #include "llvm/CodeGen/FastISel.h"
25 #include "llvm/CodeGen/FunctionLoweringInfo.h"
26 #include "llvm/CodeGen/MachineConstantPool.h"
27 #include "llvm/CodeGen/MachineFrameInfo.h"
28 #include "llvm/CodeGen/MachineRegisterInfo.h"
29 #include "llvm/IR/CallingConv.h"
30 #include "llvm/IR/DebugInfo.h"
31 #include "llvm/IR/DerivedTypes.h"
32 #include "llvm/IR/GetElementPtrTypeIterator.h"
33 #include "llvm/IR/GlobalVariable.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/IR/IntrinsicInst.h"
36 #include "llvm/IR/IntrinsicsX86.h"
37 #include "llvm/IR/Module.h"
38 #include "llvm/IR/Operator.h"
39 #include "llvm/MC/MCAsmInfo.h"
40 #include "llvm/MC/MCSymbol.h"
41 #include "llvm/Support/ErrorHandling.h"
42 #include "llvm/Target/TargetOptions.h"
43 using namespace llvm;
44 
45 namespace {
46 
47 class X86FastISel final : public FastISel {
48   /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
49   /// make the right decision when generating code for different targets.
50   const X86Subtarget *Subtarget;
51 
52 public:
X86FastISel(FunctionLoweringInfo & funcInfo,const TargetLibraryInfo * libInfo)53   explicit X86FastISel(FunctionLoweringInfo &funcInfo,
54                        const TargetLibraryInfo *libInfo)
55       : FastISel(funcInfo, libInfo) {
56     Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
57   }
58 
59   bool fastSelectInstruction(const Instruction *I) override;
60 
61   /// The specified machine instr operand is a vreg, and that
62   /// vreg is being provided by the specified load instruction.  If possible,
63   /// try to fold the load as an operand to the instruction, returning true if
64   /// possible.
65   bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
66                            const LoadInst *LI) override;
67 
68   bool fastLowerArguments() override;
69   bool fastLowerCall(CallLoweringInfo &CLI) override;
70   bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
71 
72 #include "X86GenFastISel.inc"
73 
74 private:
75   bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT,
76                           const DebugLoc &DL);
77 
78   bool X86FastEmitLoad(MVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
79                        Register &ResultReg, unsigned Alignment = 1);
80 
81   bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM,
82                         MachineMemOperand *MMO = nullptr, bool Aligned = false);
83   bool X86FastEmitStore(EVT VT, Register ValReg, X86AddressMode &AM,
84                         MachineMemOperand *MMO = nullptr, bool Aligned = false);
85 
86   bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, Register Src, EVT SrcVT,
87                          Register &ResultReg);
88 
89   bool X86SelectAddress(const Value *V, X86AddressMode &AM);
90   bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
91 
92   bool X86SelectLoad(const Instruction *I);
93 
94   bool X86SelectStore(const Instruction *I);
95 
96   bool X86SelectRet(const Instruction *I);
97 
98   bool X86SelectCmp(const Instruction *I);
99 
100   bool X86SelectZExt(const Instruction *I);
101 
102   bool X86SelectSExt(const Instruction *I);
103 
104   bool X86SelectBranch(const Instruction *I);
105 
106   bool X86SelectShift(const Instruction *I);
107 
108   bool X86SelectDivRem(const Instruction *I);
109 
110   bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);
111 
112   bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);
113 
114   bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);
115 
116   bool X86SelectSelect(const Instruction *I);
117 
118   bool X86SelectTrunc(const Instruction *I);
119 
120   bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc,
121                                const TargetRegisterClass *RC);
122 
123   bool X86SelectFPExt(const Instruction *I);
124   bool X86SelectFPTrunc(const Instruction *I);
125   bool X86SelectSIToFP(const Instruction *I);
126   bool X86SelectUIToFP(const Instruction *I);
127   bool X86SelectIntToFP(const Instruction *I, bool IsSigned);
128   bool X86SelectBitCast(const Instruction *I);
129 
getInstrInfo() const130   const X86InstrInfo *getInstrInfo() const {
131     return Subtarget->getInstrInfo();
132   }
getTargetMachine() const133   const X86TargetMachine *getTargetMachine() const {
134     return static_cast<const X86TargetMachine *>(&TM);
135   }
136 
137   bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
138 
139   Register X86MaterializeInt(const ConstantInt *CI, MVT VT);
140   Register X86MaterializeFP(const ConstantFP *CFP, MVT VT);
141   Register X86MaterializeGV(const GlobalValue *GV, MVT VT);
142   Register fastMaterializeConstant(const Constant *C) override;
143 
144   Register fastMaterializeAlloca(const AllocaInst *C) override;
145 
146   Register fastMaterializeFloatZero(const ConstantFP *CF) override;
147 
148   /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
149   /// computed in an SSE register, not on the X87 floating point stack.
isScalarFPTypeInSSEReg(EVT VT) const150   bool isScalarFPTypeInSSEReg(EVT VT) const {
151     return (VT == MVT::f64 && Subtarget->hasSSE2()) ||
152            (VT == MVT::f32 && Subtarget->hasSSE1()) || VT == MVT::f16;
153   }
154 
155   bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
156 
157   bool IsMemcpySmall(uint64_t Len);
158 
159   bool TryEmitSmallMemcpy(X86AddressMode DestAM,
160                           X86AddressMode SrcAM, uint64_t Len);
161 
162   bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
163                             const Value *Cond);
164 
165   const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
166                                             X86AddressMode &AM);
167 
168   Register fastEmitInst_rrrr(unsigned MachineInstOpcode,
169                              const TargetRegisterClass *RC, Register Op0,
170                              Register Op1, Register Op2, Register Op3);
171 };
172 
173 } // end anonymous namespace.
174 
175 static std::pair<unsigned, bool>
getX86SSEConditionCode(CmpInst::Predicate Predicate)176 getX86SSEConditionCode(CmpInst::Predicate Predicate) {
177   unsigned CC;
178   bool NeedSwap = false;
179 
180   // SSE Condition code mapping:
181   //  0 - EQ
182   //  1 - LT
183   //  2 - LE
184   //  3 - UNORD
185   //  4 - NEQ
186   //  5 - NLT
187   //  6 - NLE
188   //  7 - ORD
189   switch (Predicate) {
190   default: llvm_unreachable("Unexpected predicate");
191   case CmpInst::FCMP_OEQ: CC = 0;          break;
192   case CmpInst::FCMP_OGT: NeedSwap = true; [[fallthrough]];
193   case CmpInst::FCMP_OLT: CC = 1;          break;
194   case CmpInst::FCMP_OGE: NeedSwap = true; [[fallthrough]];
195   case CmpInst::FCMP_OLE: CC = 2;          break;
196   case CmpInst::FCMP_UNO: CC = 3;          break;
197   case CmpInst::FCMP_UNE: CC = 4;          break;
198   case CmpInst::FCMP_ULE: NeedSwap = true; [[fallthrough]];
199   case CmpInst::FCMP_UGE: CC = 5;          break;
200   case CmpInst::FCMP_ULT: NeedSwap = true; [[fallthrough]];
201   case CmpInst::FCMP_UGT: CC = 6;          break;
202   case CmpInst::FCMP_ORD: CC = 7;          break;
203   case CmpInst::FCMP_UEQ: CC = 8;          break;
204   case CmpInst::FCMP_ONE: CC = 12;         break;
205   }
206 
207   return std::make_pair(CC, NeedSwap);
208 }
209 
210 /// Adds a complex addressing mode to the given machine instr builder.
211 /// Note, this will constrain the index register.  If its not possible to
212 /// constrain the given index register, then a new one will be created.  The
213 /// IndexReg field of the addressing mode will be updated to match in this case.
214 const MachineInstrBuilder &
addFullAddress(const MachineInstrBuilder & MIB,X86AddressMode & AM)215 X86FastISel::addFullAddress(const MachineInstrBuilder &MIB,
216                             X86AddressMode &AM) {
217   // First constrain the index register.  It needs to be a GR64_NOSP.
218   AM.IndexReg = constrainOperandRegClass(MIB->getDesc(), AM.IndexReg,
219                                          MIB->getNumOperands() +
220                                          X86::AddrIndexReg);
221   return ::addFullAddress(MIB, AM);
222 }
223 
224 /// Check if it is possible to fold the condition from the XALU intrinsic
225 /// into the user. The condition code will only be updated on success.
foldX86XALUIntrinsic(X86::CondCode & CC,const Instruction * I,const Value * Cond)226 bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
227                                        const Value *Cond) {
228   if (!isa<ExtractValueInst>(Cond))
229     return false;
230 
231   const auto *EV = cast<ExtractValueInst>(Cond);
232   if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
233     return false;
234 
235   const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
236   MVT RetVT;
237   const Function *Callee = II->getCalledFunction();
238   Type *RetTy =
239     cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
240   if (!isTypeLegal(RetTy, RetVT))
241     return false;
242 
243   if (RetVT != MVT::i32 && RetVT != MVT::i64)
244     return false;
245 
246   X86::CondCode TmpCC;
247   switch (II->getIntrinsicID()) {
248   default: return false;
249   case Intrinsic::sadd_with_overflow:
250   case Intrinsic::ssub_with_overflow:
251   case Intrinsic::smul_with_overflow:
252   case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break;
253   case Intrinsic::uadd_with_overflow:
254   case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break;
255   }
256 
257   // Check if both instructions are in the same basic block.
258   if (II->getParent() != I->getParent())
259     return false;
260 
261   // Make sure nothing is in the way
262   BasicBlock::const_iterator Start(I);
263   BasicBlock::const_iterator End(II);
264   for (auto Itr = std::prev(Start); Itr != End; --Itr) {
265     // We only expect extractvalue instructions between the intrinsic and the
266     // instruction to be selected.
267     if (!isa<ExtractValueInst>(Itr))
268       return false;
269 
270     // Check that the extractvalue operand comes from the intrinsic.
271     const auto *EVI = cast<ExtractValueInst>(Itr);
272     if (EVI->getAggregateOperand() != II)
273       return false;
274   }
275 
276   // Make sure no potentially eflags clobbering phi moves can be inserted in
277   // between.
278   auto HasPhis = [](const BasicBlock *Succ) { return !Succ->phis().empty(); };
279   if (I->isTerminator() && llvm::any_of(successors(I), HasPhis))
280     return false;
281 
282   // Make sure there are no potentially eflags clobbering constant
283   // materializations in between.
284   if (llvm::any_of(I->operands(), [](Value *V) { return isa<Constant>(V); }))
285     return false;
286 
287   CC = TmpCC;
288   return true;
289 }
290 
isTypeLegal(Type * Ty,MVT & VT,bool AllowI1)291 bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
292   EVT evt = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
293   if (evt == MVT::Other || !evt.isSimple())
294     // Unhandled type. Halt "fast" selection and bail.
295     return false;
296 
297   VT = evt.getSimpleVT();
298   // For now, require SSE/SSE2 for performing floating-point operations,
299   // since x87 requires additional work.
300   if (VT == MVT::f64 && !Subtarget->hasSSE2())
301     return false;
302   if (VT == MVT::f32 && !Subtarget->hasSSE1())
303     return false;
304   // Similarly, no f80 support yet.
305   if (VT == MVT::f80)
306     return false;
307   // We only handle legal types. For example, on x86-32 the instruction
308   // selector contains all of the 64-bit instructions from x86-64,
309   // under the assumption that i64 won't be used if the target doesn't
310   // support it.
311   return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
312 }
313 
314 /// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
315 /// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
316 /// Return true and the result register by reference if it is possible.
X86FastEmitLoad(MVT VT,X86AddressMode & AM,MachineMemOperand * MMO,Register & ResultReg,unsigned Alignment)317 bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
318                                   MachineMemOperand *MMO, Register &ResultReg,
319                                   unsigned Alignment) {
320   bool HasSSE1 = Subtarget->hasSSE1();
321   bool HasSSE2 = Subtarget->hasSSE2();
322   bool HasSSE41 = Subtarget->hasSSE41();
323   bool HasAVX = Subtarget->hasAVX();
324   bool HasAVX2 = Subtarget->hasAVX2();
325   bool HasAVX512 = Subtarget->hasAVX512();
326   bool HasVLX = Subtarget->hasVLX();
327   bool IsNonTemporal = MMO && MMO->isNonTemporal();
328 
329   // Treat i1 loads the same as i8 loads. Masking will be done when storing.
330   if (VT == MVT::i1)
331     VT = MVT::i8;
332 
333   // Get opcode and regclass of the output for the given load instruction.
334   unsigned Opc = 0;
335   switch (VT.SimpleTy) {
336   default: return false;
337   case MVT::i8:
338     Opc = X86::MOV8rm;
339     break;
340   case MVT::i16:
341     Opc = X86::MOV16rm;
342     break;
343   case MVT::i32:
344     Opc = X86::MOV32rm;
345     break;
346   case MVT::i64:
347     // Must be in x86-64 mode.
348     Opc = X86::MOV64rm;
349     break;
350   case MVT::f32:
351     Opc = HasAVX512 ? X86::VMOVSSZrm_alt
352           : HasAVX  ? X86::VMOVSSrm_alt
353           : HasSSE1 ? X86::MOVSSrm_alt
354                     : X86::LD_Fp32m;
355     break;
356   case MVT::f64:
357     Opc = HasAVX512 ? X86::VMOVSDZrm_alt
358           : HasAVX  ? X86::VMOVSDrm_alt
359           : HasSSE2 ? X86::MOVSDrm_alt
360                     : X86::LD_Fp64m;
361     break;
362   case MVT::f80:
363     // No f80 support yet.
364     return false;
365   case MVT::v4f32:
366     if (IsNonTemporal && Alignment >= 16 && HasSSE41)
367       Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
368             HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
369     else if (Alignment >= 16)
370       Opc = HasVLX ? X86::VMOVAPSZ128rm :
371             HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
372     else
373       Opc = HasVLX ? X86::VMOVUPSZ128rm :
374             HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
375     break;
376   case MVT::v2f64:
377     if (IsNonTemporal && Alignment >= 16 && HasSSE41)
378       Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
379             HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
380     else if (Alignment >= 16)
381       Opc = HasVLX ? X86::VMOVAPDZ128rm :
382             HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
383     else
384       Opc = HasVLX ? X86::VMOVUPDZ128rm :
385             HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
386     break;
387   case MVT::v4i32:
388   case MVT::v2i64:
389   case MVT::v8i16:
390   case MVT::v16i8:
391     if (IsNonTemporal && Alignment >= 16 && HasSSE41)
392       Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
393             HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
394     else if (Alignment >= 16)
395       Opc = HasVLX ? X86::VMOVDQA64Z128rm :
396             HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
397     else
398       Opc = HasVLX ? X86::VMOVDQU64Z128rm :
399             HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
400     break;
401   case MVT::v8f32:
402     assert(HasAVX);
403     if (IsNonTemporal && Alignment >= 32 && HasAVX2)
404       Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
405     else if (IsNonTemporal && Alignment >= 16)
406       return false; // Force split for X86::VMOVNTDQArm
407     else if (Alignment >= 32)
408       Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
409     else
410       Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm;
411     break;
412   case MVT::v4f64:
413     assert(HasAVX);
414     if (IsNonTemporal && Alignment >= 32 && HasAVX2)
415       Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
416     else if (IsNonTemporal && Alignment >= 16)
417       return false; // Force split for X86::VMOVNTDQArm
418     else if (Alignment >= 32)
419       Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
420     else
421       Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm;
422     break;
423   case MVT::v8i32:
424   case MVT::v4i64:
425   case MVT::v16i16:
426   case MVT::v32i8:
427     assert(HasAVX);
428     if (IsNonTemporal && Alignment >= 32 && HasAVX2)
429       Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
430     else if (IsNonTemporal && Alignment >= 16)
431       return false; // Force split for X86::VMOVNTDQArm
432     else if (Alignment >= 32)
433       Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
434     else
435       Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm;
436     break;
437   case MVT::v16f32:
438     assert(HasAVX512);
439     if (IsNonTemporal && Alignment >= 64)
440       Opc = X86::VMOVNTDQAZrm;
441     else
442       Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm;
443     break;
444   case MVT::v8f64:
445     assert(HasAVX512);
446     if (IsNonTemporal && Alignment >= 64)
447       Opc = X86::VMOVNTDQAZrm;
448     else
449       Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm;
450     break;
451   case MVT::v8i64:
452   case MVT::v16i32:
453   case MVT::v32i16:
454   case MVT::v64i8:
455     assert(HasAVX512);
456     // Note: There are a lot more choices based on type with AVX-512, but
457     // there's really no advantage when the load isn't masked.
458     if (IsNonTemporal && Alignment >= 64)
459       Opc = X86::VMOVNTDQAZrm;
460     else
461       Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm;
462     break;
463   }
464 
465   const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
466 
467   ResultReg = createResultReg(RC);
468   MachineInstrBuilder MIB =
469     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg);
470   addFullAddress(MIB, AM);
471   if (MMO)
472     MIB->addMemOperand(*FuncInfo.MF, MMO);
473   return true;
474 }
475 
476 /// X86FastEmitStore - Emit a machine instruction to store a value Val of
477 /// type VT. The address is either pre-computed, consisted of a base ptr, Ptr
478 /// and a displacement offset, or a GlobalAddress,
479 /// i.e. V. Return true if it is possible.
X86FastEmitStore(EVT VT,Register ValReg,X86AddressMode & AM,MachineMemOperand * MMO,bool Aligned)480 bool X86FastISel::X86FastEmitStore(EVT VT, Register ValReg, X86AddressMode &AM,
481                                    MachineMemOperand *MMO, bool Aligned) {
482   bool HasSSE1 = Subtarget->hasSSE1();
483   bool HasSSE2 = Subtarget->hasSSE2();
484   bool HasSSE4A = Subtarget->hasSSE4A();
485   bool HasAVX = Subtarget->hasAVX();
486   bool HasAVX512 = Subtarget->hasAVX512();
487   bool HasVLX = Subtarget->hasVLX();
488   bool IsNonTemporal = MMO && MMO->isNonTemporal();
489 
490   // Get opcode and regclass of the output for the given store instruction.
491   unsigned Opc = 0;
492   switch (VT.getSimpleVT().SimpleTy) {
493   case MVT::f80: // No f80 support yet.
494   default: return false;
495   case MVT::i1: {
496     // Mask out all but lowest bit.
497     Register AndResult = createResultReg(&X86::GR8RegClass);
498     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
499             TII.get(X86::AND8ri), AndResult)
500       .addReg(ValReg).addImm(1);
501     ValReg = AndResult;
502     [[fallthrough]]; // handle i1 as i8.
503   }
504   case MVT::i8:  Opc = X86::MOV8mr;  break;
505   case MVT::i16: Opc = X86::MOV16mr; break;
506   case MVT::i32:
507     Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr;
508     break;
509   case MVT::i64:
510     // Must be in x86-64 mode.
511     Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr;
512     break;
513   case MVT::f32:
514     if (HasSSE1) {
515       if (IsNonTemporal && HasSSE4A)
516         Opc = X86::MOVNTSS;
517       else
518         Opc = HasAVX512 ? X86::VMOVSSZmr :
519               HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
520     } else
521       Opc = X86::ST_Fp32m;
522     break;
523   case MVT::f64:
524     if (HasSSE2) {
525       if (IsNonTemporal && HasSSE4A)
526         Opc = X86::MOVNTSD;
527       else
528         Opc = HasAVX512 ? X86::VMOVSDZmr :
529               HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
530     } else
531       Opc = X86::ST_Fp64m;
532     break;
533   case MVT::x86mmx:
534     Opc = (IsNonTemporal && HasSSE1) ? X86::MMX_MOVNTQmr : X86::MMX_MOVQ64mr;
535     break;
536   case MVT::v4f32:
537     if (Aligned) {
538       if (IsNonTemporal)
539         Opc = HasVLX ? X86::VMOVNTPSZ128mr :
540               HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
541       else
542         Opc = HasVLX ? X86::VMOVAPSZ128mr :
543               HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
544     } else
545       Opc = HasVLX ? X86::VMOVUPSZ128mr :
546             HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
547     break;
548   case MVT::v2f64:
549     if (Aligned) {
550       if (IsNonTemporal)
551         Opc = HasVLX ? X86::VMOVNTPDZ128mr :
552               HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
553       else
554         Opc = HasVLX ? X86::VMOVAPDZ128mr :
555               HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
556     } else
557       Opc = HasVLX ? X86::VMOVUPDZ128mr :
558             HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
559     break;
560   case MVT::v4i32:
561   case MVT::v2i64:
562   case MVT::v8i16:
563   case MVT::v16i8:
564     if (Aligned) {
565       if (IsNonTemporal)
566         Opc = HasVLX ? X86::VMOVNTDQZ128mr :
567               HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
568       else
569         Opc = HasVLX ? X86::VMOVDQA64Z128mr :
570               HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
571     } else
572       Opc = HasVLX ? X86::VMOVDQU64Z128mr :
573             HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
574     break;
575   case MVT::v8f32:
576     assert(HasAVX);
577     if (Aligned) {
578       if (IsNonTemporal)
579         Opc = HasVLX ? X86::VMOVNTPSZ256mr : X86::VMOVNTPSYmr;
580       else
581         Opc = HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr;
582     } else
583       Opc = HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr;
584     break;
585   case MVT::v4f64:
586     assert(HasAVX);
587     if (Aligned) {
588       if (IsNonTemporal)
589         Opc = HasVLX ? X86::VMOVNTPDZ256mr : X86::VMOVNTPDYmr;
590       else
591         Opc = HasVLX ? X86::VMOVAPDZ256mr : X86::VMOVAPDYmr;
592     } else
593       Opc = HasVLX ? X86::VMOVUPDZ256mr : X86::VMOVUPDYmr;
594     break;
595   case MVT::v8i32:
596   case MVT::v4i64:
597   case MVT::v16i16:
598   case MVT::v32i8:
599     assert(HasAVX);
600     if (Aligned) {
601       if (IsNonTemporal)
602         Opc = HasVLX ? X86::VMOVNTDQZ256mr : X86::VMOVNTDQYmr;
603       else
604         Opc = HasVLX ? X86::VMOVDQA64Z256mr : X86::VMOVDQAYmr;
605     } else
606       Opc = HasVLX ? X86::VMOVDQU64Z256mr : X86::VMOVDQUYmr;
607     break;
608   case MVT::v16f32:
609     assert(HasAVX512);
610     if (Aligned)
611       Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr;
612     else
613       Opc = X86::VMOVUPSZmr;
614     break;
615   case MVT::v8f64:
616     assert(HasAVX512);
617     if (Aligned) {
618       Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr;
619     } else
620       Opc = X86::VMOVUPDZmr;
621     break;
622   case MVT::v8i64:
623   case MVT::v16i32:
624   case MVT::v32i16:
625   case MVT::v64i8:
626     assert(HasAVX512);
627     // Note: There are a lot more choices based on type with AVX-512, but
628     // there's really no advantage when the store isn't masked.
629     if (Aligned)
630       Opc = IsNonTemporal ? X86::VMOVNTDQZmr : X86::VMOVDQA64Zmr;
631     else
632       Opc = X86::VMOVDQU64Zmr;
633     break;
634   }
635 
636   const MCInstrDesc &Desc = TII.get(Opc);
637   // Some of the instructions in the previous switch use FR128 instead
638   // of FR32 for ValReg. Make sure the register we feed the instruction
639   // matches its register class constraints.
640   // Note: This is fine to do a copy from FR32 to FR128, this is the
641   // same registers behind the scene and actually why it did not trigger
642   // any bugs before.
643   ValReg = constrainOperandRegClass(Desc, ValReg, Desc.getNumOperands() - 1);
644   MachineInstrBuilder MIB =
645       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, Desc);
646   addFullAddress(MIB, AM).addReg(ValReg);
647   if (MMO)
648     MIB->addMemOperand(*FuncInfo.MF, MMO);
649 
650   return true;
651 }
652 
X86FastEmitStore(EVT VT,const Value * Val,X86AddressMode & AM,MachineMemOperand * MMO,bool Aligned)653 bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
654                                    X86AddressMode &AM,
655                                    MachineMemOperand *MMO, bool Aligned) {
656   // Handle 'null' like i32/i64 0.
657   if (isa<ConstantPointerNull>(Val))
658     Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
659 
660   // If this is a store of a simple constant, fold the constant into the store.
661   if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
662     unsigned Opc = 0;
663     bool Signed = true;
664     switch (VT.getSimpleVT().SimpleTy) {
665     default: break;
666     case MVT::i1:
667       Signed = false;
668       [[fallthrough]]; // Handle as i8.
669     case MVT::i8:  Opc = X86::MOV8mi;  break;
670     case MVT::i16: Opc = X86::MOV16mi; break;
671     case MVT::i32: Opc = X86::MOV32mi; break;
672     case MVT::i64:
673       // Must be a 32-bit sign extended value.
674       if (isInt<32>(CI->getSExtValue()))
675         Opc = X86::MOV64mi32;
676       break;
677     }
678 
679     if (Opc) {
680       MachineInstrBuilder MIB =
681         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc));
682       addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue()
683                                             : CI->getZExtValue());
684       if (MMO)
685         MIB->addMemOperand(*FuncInfo.MF, MMO);
686       return true;
687     }
688   }
689 
690   Register ValReg = getRegForValue(Val);
691   if (!ValReg)
692     return false;
693 
694   return X86FastEmitStore(VT, ValReg, AM, MMO, Aligned);
695 }
696 
697 /// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
698 /// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
699 /// ISD::SIGN_EXTEND).
X86FastEmitExtend(ISD::NodeType Opc,EVT DstVT,Register Src,EVT SrcVT,Register & ResultReg)700 bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, Register Src,
701                                     EVT SrcVT, Register &ResultReg) {
702   Register RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, Src);
703   if (!RR)
704     return false;
705 
706   ResultReg = RR;
707   return true;
708 }
709 
handleConstantAddresses(const Value * V,X86AddressMode & AM)710 bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
711   // Handle constant address.
712   if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
713     // Can't handle alternate code models yet.
714     if (TM.getCodeModel() != CodeModel::Small &&
715         TM.getCodeModel() != CodeModel::Medium)
716       return false;
717 
718     // Can't handle large objects yet.
719     if (TM.isLargeGlobalValue(GV))
720       return false;
721 
722     // Can't handle TLS yet.
723     if (GV->isThreadLocal())
724       return false;
725 
726     // Can't handle !absolute_symbol references yet.
727     if (GV->isAbsoluteSymbolRef())
728       return false;
729 
730     // RIP-relative addresses can't have additional register operands, so if
731     // we've already folded stuff into the addressing mode, just force the
732     // global value into its own register, which we can use as the basereg.
733     if (!Subtarget->isPICStyleRIPRel() ||
734         (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
735       // Okay, we've committed to selecting this global. Set up the address.
736       AM.GV = GV;
737 
738       // Allow the subtarget to classify the global.
739       unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
740 
741       // If this reference is relative to the pic base, set it now.
742       if (isGlobalRelativeToPICBase(GVFlags)) {
743         // FIXME: How do we know Base.Reg is free??
744         AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
745       }
746 
747       // Unless the ABI requires an extra load, return a direct reference to
748       // the global.
749       if (!isGlobalStubReference(GVFlags)) {
750         if (Subtarget->isPICStyleRIPRel()) {
751           // Use rip-relative addressing if we can.  Above we verified that the
752           // base and index registers are unused.
753           assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
754           AM.Base.Reg = X86::RIP;
755         }
756         AM.GVOpFlags = GVFlags;
757         return true;
758       }
759 
760       // Ok, we need to do a load from a stub.  If we've already loaded from
761       // this stub, reuse the loaded pointer, otherwise emit the load now.
762       DenseMap<const Value *, Register>::iterator I = LocalValueMap.find(V);
763       Register LoadReg;
764       if (I != LocalValueMap.end() && I->second) {
765         LoadReg = I->second;
766       } else {
767         // Issue load from stub.
768         unsigned Opc = 0;
769         const TargetRegisterClass *RC = nullptr;
770         X86AddressMode StubAM;
771         StubAM.Base.Reg = AM.Base.Reg;
772         StubAM.GV = GV;
773         StubAM.GVOpFlags = GVFlags;
774 
775         // Prepare for inserting code in the local-value area.
776         SavePoint SaveInsertPt = enterLocalValueArea();
777 
778         if (TLI.getPointerTy(DL) == MVT::i64) {
779           Opc = X86::MOV64rm;
780           RC  = &X86::GR64RegClass;
781         } else {
782           Opc = X86::MOV32rm;
783           RC  = &X86::GR32RegClass;
784         }
785 
786         if (Subtarget->isPICStyleRIPRel() || GVFlags == X86II::MO_GOTPCREL ||
787             GVFlags == X86II::MO_GOTPCREL_NORELAX)
788           StubAM.Base.Reg = X86::RIP;
789 
790         LoadReg = createResultReg(RC);
791         MachineInstrBuilder LoadMI =
792           BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), LoadReg);
793         addFullAddress(LoadMI, StubAM);
794 
795         // Ok, back to normal mode.
796         leaveLocalValueArea(SaveInsertPt);
797 
798         // Prevent loading GV stub multiple times in same MBB.
799         LocalValueMap[V] = LoadReg;
800       }
801 
802       // Now construct the final address. Note that the Disp, Scale,
803       // and Index values may already be set here.
804       AM.Base.Reg = LoadReg;
805       AM.GV = nullptr;
806       return true;
807     }
808   }
809 
810   // If all else fails, try to materialize the value in a register.
811   if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
812     if (AM.Base.Reg == 0) {
813       AM.Base.Reg = getRegForValue(V);
814       return AM.Base.Reg != 0;
815     }
816     if (AM.IndexReg == 0) {
817       assert(AM.Scale == 1 && "Scale with no index!");
818       AM.IndexReg = getRegForValue(V);
819       return AM.IndexReg != 0;
820     }
821   }
822 
823   return false;
824 }
825 
826 /// X86SelectAddress - Attempt to fill in an address from the given value.
827 ///
X86SelectAddress(const Value * V,X86AddressMode & AM)828 bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
829   SmallVector<const Value *, 32> GEPs;
830 redo_gep:
831   const User *U = nullptr;
832   unsigned Opcode = Instruction::UserOp1;
833   if (const Instruction *I = dyn_cast<Instruction>(V)) {
834     // Don't walk into other basic blocks; it's possible we haven't
835     // visited them yet, so the instructions may not yet be assigned
836     // virtual registers.
837     if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
838         FuncInfo.getMBB(I->getParent()) == FuncInfo.MBB) {
839       Opcode = I->getOpcode();
840       U = I;
841     }
842   } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
843     Opcode = C->getOpcode();
844     U = C;
845   }
846 
847   if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
848     if (Ty->getAddressSpace() > 255)
849       // Fast instruction selection doesn't support the special
850       // address spaces.
851       return false;
852 
853   switch (Opcode) {
854   default: break;
855   case Instruction::BitCast:
856     // Look past bitcasts.
857     return X86SelectAddress(U->getOperand(0), AM);
858 
859   case Instruction::IntToPtr:
860     // Look past no-op inttoptrs.
861     if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
862         TLI.getPointerTy(DL))
863       return X86SelectAddress(U->getOperand(0), AM);
864     break;
865 
866   case Instruction::PtrToInt:
867     // Look past no-op ptrtoints.
868     if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
869       return X86SelectAddress(U->getOperand(0), AM);
870     break;
871 
872   case Instruction::Alloca: {
873     // Do static allocas.
874     const AllocaInst *A = cast<AllocaInst>(V);
875     DenseMap<const AllocaInst *, int>::iterator SI =
876       FuncInfo.StaticAllocaMap.find(A);
877     if (SI != FuncInfo.StaticAllocaMap.end()) {
878       AM.BaseType = X86AddressMode::FrameIndexBase;
879       AM.Base.FrameIndex = SI->second;
880       return true;
881     }
882     break;
883   }
884 
885   case Instruction::Add: {
886     // Adds of constants are common and easy enough.
887     if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
888       uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
889       // They have to fit in the 32-bit signed displacement field though.
890       if (isInt<32>(Disp)) {
891         AM.Disp = (uint32_t)Disp;
892         return X86SelectAddress(U->getOperand(0), AM);
893       }
894     }
895     break;
896   }
897 
898   case Instruction::GetElementPtr: {
899     X86AddressMode SavedAM = AM;
900 
901     // Pattern-match simple GEPs.
902     uint64_t Disp = (int32_t)AM.Disp;
903     Register IndexReg = AM.IndexReg;
904     unsigned Scale = AM.Scale;
905     MVT PtrVT = TLI.getValueType(DL, U->getType()).getSimpleVT();
906 
907     gep_type_iterator GTI = gep_type_begin(U);
908     // Iterate through the indices, folding what we can. Constants can be
909     // folded, and one dynamic index can be handled, if the scale is supported.
910     for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
911          i != e; ++i, ++GTI) {
912       const Value *Op = *i;
913       if (StructType *STy = GTI.getStructTypeOrNull()) {
914         const StructLayout *SL = DL.getStructLayout(STy);
915         Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
916         continue;
917       }
918 
919       // A array/variable index is always of the form i*S where S is the
920       // constant scale size.  See if we can push the scale into immediates.
921       uint64_t S = GTI.getSequentialElementStride(DL);
922       for (;;) {
923         if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
924           // Constant-offset addressing.
925           Disp += CI->getSExtValue() * S;
926           break;
927         }
928         if (canFoldAddIntoGEP(U, Op)) {
929           // A compatible add with a constant operand. Fold the constant.
930           ConstantInt *CI =
931             cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
932           Disp += CI->getSExtValue() * S;
933           // Iterate on the other operand.
934           Op = cast<AddOperator>(Op)->getOperand(0);
935           continue;
936         }
937         if (!IndexReg && (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
938             (S == 1 || S == 2 || S == 4 || S == 8)) {
939           // Scaled-index addressing.
940           Scale = S;
941           IndexReg = getRegForGEPIndex(PtrVT, Op);
942           if (!IndexReg)
943             return false;
944           break;
945         }
946         // Unsupported.
947         goto unsupported_gep;
948       }
949     }
950 
951     // Check for displacement overflow.
952     if (!isInt<32>(Disp))
953       break;
954 
955     AM.IndexReg = IndexReg;
956     AM.Scale = Scale;
957     AM.Disp = (uint32_t)Disp;
958     GEPs.push_back(V);
959 
960     if (const GetElementPtrInst *GEP =
961           dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
962       // Ok, the GEP indices were covered by constant-offset and scaled-index
963       // addressing. Update the address state and move on to examining the base.
964       V = GEP;
965       goto redo_gep;
966     } else if (X86SelectAddress(U->getOperand(0), AM)) {
967       return true;
968     }
969 
970     // If we couldn't merge the gep value into this addr mode, revert back to
971     // our address and just match the value instead of completely failing.
972     AM = SavedAM;
973 
974     for (const Value *I : reverse(GEPs))
975       if (handleConstantAddresses(I, AM))
976         return true;
977 
978     return false;
979   unsupported_gep:
980     // Ok, the GEP indices weren't all covered.
981     break;
982   }
983   }
984 
985   return handleConstantAddresses(V, AM);
986 }
987 
988 /// X86SelectCallAddress - Attempt to fill in an address from the given value.
989 ///
X86SelectCallAddress(const Value * V,X86AddressMode & AM)990 bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
991   const User *U = nullptr;
992   unsigned Opcode = Instruction::UserOp1;
993   const Instruction *I = dyn_cast<Instruction>(V);
994   // Record if the value is defined in the same basic block.
995   //
996   // This information is crucial to know whether or not folding an
997   // operand is valid.
998   // Indeed, FastISel generates or reuses a virtual register for all
999   // operands of all instructions it selects. Obviously, the definition and
1000   // its uses must use the same virtual register otherwise the produced
1001   // code is incorrect.
1002   // Before instruction selection, FunctionLoweringInfo::set sets the virtual
1003   // registers for values that are alive across basic blocks. This ensures
1004   // that the values are consistently set between across basic block, even
1005   // if different instruction selection mechanisms are used (e.g., a mix of
1006   // SDISel and FastISel).
1007   // For values local to a basic block, the instruction selection process
1008   // generates these virtual registers with whatever method is appropriate
1009   // for its needs. In particular, FastISel and SDISel do not share the way
1010   // local virtual registers are set.
1011   // Therefore, this is impossible (or at least unsafe) to share values
1012   // between basic blocks unless they use the same instruction selection
1013   // method, which is not guarantee for X86.
1014   // Moreover, things like hasOneUse could not be used accurately, if we
1015   // allow to reference values across basic blocks whereas they are not
1016   // alive across basic blocks initially.
1017   bool InMBB = true;
1018   if (I) {
1019     Opcode = I->getOpcode();
1020     U = I;
1021     InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
1022   } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
1023     Opcode = C->getOpcode();
1024     U = C;
1025   }
1026 
1027   switch (Opcode) {
1028   default: break;
1029   case Instruction::BitCast:
1030     // Look past bitcasts if its operand is in the same BB.
1031     if (InMBB)
1032       return X86SelectCallAddress(U->getOperand(0), AM);
1033     break;
1034 
1035   case Instruction::IntToPtr:
1036     // Look past no-op inttoptrs if its operand is in the same BB.
1037     if (InMBB &&
1038         TLI.getValueType(DL, U->getOperand(0)->getType()) ==
1039             TLI.getPointerTy(DL))
1040       return X86SelectCallAddress(U->getOperand(0), AM);
1041     break;
1042 
1043   case Instruction::PtrToInt:
1044     // Look past no-op ptrtoints if its operand is in the same BB.
1045     if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
1046       return X86SelectCallAddress(U->getOperand(0), AM);
1047     break;
1048   }
1049 
1050   // Handle constant address.
1051   if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
1052     // Can't handle alternate code models yet.
1053     if (TM.getCodeModel() != CodeModel::Small &&
1054         TM.getCodeModel() != CodeModel::Medium)
1055       return false;
1056 
1057     // RIP-relative addresses can't have additional register operands.
1058     if (Subtarget->isPICStyleRIPRel() &&
1059         (AM.Base.Reg != 0 || AM.IndexReg != 0))
1060       return false;
1061 
1062     // Can't handle TLS.
1063     if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
1064       if (GVar->isThreadLocal())
1065         return false;
1066 
1067     // Okay, we've committed to selecting this global. Set up the basic address.
1068     AM.GV = GV;
1069 
1070     // Return a direct reference to the global. Fastisel can handle calls to
1071     // functions that require loads, such as dllimport and nonlazybind
1072     // functions.
1073     if (Subtarget->isPICStyleRIPRel()) {
1074       // Use rip-relative addressing if we can.  Above we verified that the
1075       // base and index registers are unused.
1076       assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
1077       AM.Base.Reg = X86::RIP;
1078     } else {
1079       AM.GVOpFlags = Subtarget->classifyLocalReference(nullptr);
1080     }
1081 
1082     return true;
1083   }
1084 
1085   // If all else fails, try to materialize the value in a register.
1086   if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
1087     auto GetCallRegForValue = [this](const Value *V) {
1088       Register Reg = getRegForValue(V);
1089 
1090       // In 64-bit mode, we need a 64-bit register even if pointers are 32 bits.
1091       if (Reg && Subtarget->isTarget64BitILP32()) {
1092         Register CopyReg = createResultReg(&X86::GR32RegClass);
1093         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV32rr),
1094                 CopyReg)
1095             .addReg(Reg);
1096 
1097         Register ExtReg = createResultReg(&X86::GR64RegClass);
1098         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1099                 TII.get(TargetOpcode::SUBREG_TO_REG), ExtReg)
1100             .addImm(0)
1101             .addReg(CopyReg)
1102             .addImm(X86::sub_32bit);
1103         Reg = ExtReg;
1104       }
1105 
1106       return Reg;
1107     };
1108 
1109     if (AM.Base.Reg == 0) {
1110       AM.Base.Reg = GetCallRegForValue(V);
1111       return AM.Base.Reg != 0;
1112     }
1113     if (AM.IndexReg == 0) {
1114       assert(AM.Scale == 1 && "Scale with no index!");
1115       AM.IndexReg = GetCallRegForValue(V);
1116       return AM.IndexReg != 0;
1117     }
1118   }
1119 
1120   return false;
1121 }
1122 
1123 
1124 /// X86SelectStore - Select and emit code to implement store instructions.
X86SelectStore(const Instruction * I)1125 bool X86FastISel::X86SelectStore(const Instruction *I) {
1126   // Atomic stores need special handling.
1127   const StoreInst *S = cast<StoreInst>(I);
1128 
1129   if (S->isAtomic())
1130     return false;
1131 
1132   const Value *PtrV = I->getOperand(1);
1133   if (TLI.supportSwiftError()) {
1134     // Swifterror values can come from either a function parameter with
1135     // swifterror attribute or an alloca with swifterror attribute.
1136     if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
1137       if (Arg->hasSwiftErrorAttr())
1138         return false;
1139     }
1140 
1141     if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
1142       if (Alloca->isSwiftError())
1143         return false;
1144     }
1145   }
1146 
1147   const Value *Val = S->getValueOperand();
1148   const Value *Ptr = S->getPointerOperand();
1149 
1150   MVT VT;
1151   if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
1152     return false;
1153 
1154   Align Alignment = S->getAlign();
1155   Align ABIAlignment = DL.getABITypeAlign(Val->getType());
1156   bool Aligned = Alignment >= ABIAlignment;
1157 
1158   X86AddressMode AM;
1159   if (!X86SelectAddress(Ptr, AM))
1160     return false;
1161 
1162   return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
1163 }
1164 
1165 /// X86SelectRet - Select and emit code to implement ret instructions.
X86SelectRet(const Instruction * I)1166 bool X86FastISel::X86SelectRet(const Instruction *I) {
1167   const ReturnInst *Ret = cast<ReturnInst>(I);
1168   const Function &F = *I->getParent()->getParent();
1169   const X86MachineFunctionInfo *X86MFInfo =
1170       FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
1171 
1172   if (!FuncInfo.CanLowerReturn)
1173     return false;
1174 
1175   if (TLI.supportSwiftError() &&
1176       F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
1177     return false;
1178 
1179   if (TLI.supportSplitCSR(FuncInfo.MF))
1180     return false;
1181 
1182   CallingConv::ID CC = F.getCallingConv();
1183   if (CC != CallingConv::C &&
1184       CC != CallingConv::Fast &&
1185       CC != CallingConv::Tail &&
1186       CC != CallingConv::SwiftTail &&
1187       CC != CallingConv::X86_FastCall &&
1188       CC != CallingConv::X86_StdCall &&
1189       CC != CallingConv::X86_ThisCall &&
1190       CC != CallingConv::X86_64_SysV &&
1191       CC != CallingConv::Win64)
1192     return false;
1193 
1194   // Don't handle popping bytes if they don't fit the ret's immediate.
1195   if (!isUInt<16>(X86MFInfo->getBytesToPopOnReturn()))
1196     return false;
1197 
1198   // fastcc with -tailcallopt is intended to provide a guaranteed
1199   // tail call optimization. Fastisel doesn't know how to do that.
1200   if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
1201       CC == CallingConv::Tail || CC == CallingConv::SwiftTail)
1202     return false;
1203 
1204   // Let SDISel handle vararg functions.
1205   if (F.isVarArg())
1206     return false;
1207 
1208   // Build a list of return value registers.
1209   SmallVector<Register, 4> RetRegs;
1210 
1211   if (Ret->getNumOperands() > 0) {
1212     SmallVector<ISD::OutputArg, 4> Outs;
1213     GetReturnInfo(CC, F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
1214 
1215     // Analyze operands of the call, assigning locations to each operand.
1216     SmallVector<CCValAssign, 16> ValLocs;
1217     CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
1218     CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1219 
1220     const Value *RV = Ret->getOperand(0);
1221     Register Reg = getRegForValue(RV);
1222     if (!Reg)
1223       return false;
1224 
1225     // Only handle a single return value for now.
1226     if (ValLocs.size() != 1)
1227       return false;
1228 
1229     CCValAssign &VA = ValLocs[0];
1230 
1231     // Don't bother handling odd stuff for now.
1232     if (VA.getLocInfo() != CCValAssign::Full)
1233       return false;
1234     // Only handle register returns for now.
1235     if (!VA.isRegLoc())
1236       return false;
1237 
1238     // The calling-convention tables for x87 returns don't tell
1239     // the whole story.
1240     if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
1241       return false;
1242 
1243     Register SrcReg = Reg + VA.getValNo();
1244     EVT SrcVT = TLI.getValueType(DL, RV->getType());
1245     EVT DstVT = VA.getValVT();
1246     // Special handling for extended integers.
1247     if (SrcVT != DstVT) {
1248       if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
1249         return false;
1250 
1251       if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
1252         return false;
1253 
1254       if (SrcVT == MVT::i1) {
1255         if (Outs[0].Flags.isSExt())
1256           return false;
1257         SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg);
1258         SrcVT = MVT::i8;
1259       }
1260       if (SrcVT != DstVT) {
1261         unsigned Op =
1262             Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
1263         SrcReg =
1264             fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg);
1265       }
1266     }
1267 
1268     // Make the copy.
1269     Register DstReg = VA.getLocReg();
1270     const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
1271     // Avoid a cross-class copy. This is very unlikely.
1272     if (!SrcRC->contains(DstReg))
1273       return false;
1274     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1275             TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
1276 
1277     // Add register to return instruction.
1278     RetRegs.push_back(VA.getLocReg());
1279   }
1280 
1281   // Swift calling convention does not require we copy the sret argument
1282   // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
1283 
1284   // All x86 ABIs require that for returning structs by value we copy
1285   // the sret argument into %rax/%eax (depending on ABI) for the return.
1286   // We saved the argument into a virtual register in the entry block,
1287   // so now we copy the value out and into %rax/%eax.
1288   if (F.hasStructRetAttr() && CC != CallingConv::Swift &&
1289       CC != CallingConv::SwiftTail) {
1290     Register Reg = X86MFInfo->getSRetReturnReg();
1291     assert(Reg &&
1292            "SRetReturnReg should have been set in LowerFormalArguments()!");
1293     Register RetReg = Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX;
1294     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1295             TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
1296     RetRegs.push_back(RetReg);
1297   }
1298 
1299   // Now emit the RET.
1300   MachineInstrBuilder MIB;
1301   if (X86MFInfo->getBytesToPopOnReturn()) {
1302     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1303                   TII.get(Subtarget->is64Bit() ? X86::RETI64 : X86::RETI32))
1304               .addImm(X86MFInfo->getBytesToPopOnReturn());
1305   } else {
1306     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1307                   TII.get(Subtarget->is64Bit() ? X86::RET64 : X86::RET32));
1308   }
1309   for (Register Reg : RetRegs)
1310     MIB.addReg(Reg, RegState::Implicit);
1311   return true;
1312 }
1313 
1314 /// X86SelectLoad - Select and emit code to implement load instructions.
1315 ///
X86SelectLoad(const Instruction * I)1316 bool X86FastISel::X86SelectLoad(const Instruction *I) {
1317   const LoadInst *LI = cast<LoadInst>(I);
1318 
1319   // Atomic loads need special handling.
1320   if (LI->isAtomic())
1321     return false;
1322 
1323   const Value *SV = I->getOperand(0);
1324   if (TLI.supportSwiftError()) {
1325     // Swifterror values can come from either a function parameter with
1326     // swifterror attribute or an alloca with swifterror attribute.
1327     if (const Argument *Arg = dyn_cast<Argument>(SV)) {
1328       if (Arg->hasSwiftErrorAttr())
1329         return false;
1330     }
1331 
1332     if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
1333       if (Alloca->isSwiftError())
1334         return false;
1335     }
1336   }
1337 
1338   MVT VT;
1339   if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
1340     return false;
1341 
1342   const Value *Ptr = LI->getPointerOperand();
1343 
1344   X86AddressMode AM;
1345   if (!X86SelectAddress(Ptr, AM))
1346     return false;
1347 
1348   Register ResultReg;
1349   if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg,
1350                        LI->getAlign().value()))
1351     return false;
1352 
1353   updateValueMap(I, ResultReg);
1354   return true;
1355 }
1356 
X86ChooseCmpOpcode(EVT VT,const X86Subtarget * Subtarget)1357 static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
1358   bool HasAVX512 = Subtarget->hasAVX512();
1359   bool HasAVX = Subtarget->hasAVX();
1360   bool HasSSE1 = Subtarget->hasSSE1();
1361   bool HasSSE2 = Subtarget->hasSSE2();
1362 
1363   switch (VT.getSimpleVT().SimpleTy) {
1364   default:       return 0;
1365   case MVT::i8:  return X86::CMP8rr;
1366   case MVT::i16: return X86::CMP16rr;
1367   case MVT::i32: return X86::CMP32rr;
1368   case MVT::i64: return X86::CMP64rr;
1369   case MVT::f32:
1370     return HasAVX512 ? X86::VUCOMISSZrr
1371            : HasAVX  ? X86::VUCOMISSrr
1372            : HasSSE1 ? X86::UCOMISSrr
1373                      : 0;
1374   case MVT::f64:
1375     return HasAVX512 ? X86::VUCOMISDZrr
1376            : HasAVX  ? X86::VUCOMISDrr
1377            : HasSSE2 ? X86::UCOMISDrr
1378                      : 0;
1379   }
1380 }
1381 
1382 /// If we have a comparison with RHS as the RHS  of the comparison, return an
1383 /// opcode that works for the compare (e.g. CMP32ri) otherwise return 0.
X86ChooseCmpImmediateOpcode(EVT VT,const ConstantInt * RHSC)1384 static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
1385   switch (VT.getSimpleVT().SimpleTy) {
1386   // Otherwise, we can't fold the immediate into this comparison.
1387   default:
1388     return 0;
1389   case MVT::i8:
1390     return X86::CMP8ri;
1391   case MVT::i16:
1392     return X86::CMP16ri;
1393   case MVT::i32:
1394     return X86::CMP32ri;
1395   case MVT::i64:
1396     // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
1397     // field.
1398     return isInt<32>(RHSC->getSExtValue()) ? X86::CMP64ri32 : 0;
1399   }
1400 }
1401 
X86FastEmitCompare(const Value * Op0,const Value * Op1,EVT VT,const DebugLoc & CurMIMD)1402 bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT,
1403                                      const DebugLoc &CurMIMD) {
1404   Register Op0Reg = getRegForValue(Op0);
1405   if (!Op0Reg)
1406     return false;
1407 
1408   // Handle 'null' like i32/i64 0.
1409   if (isa<ConstantPointerNull>(Op1))
1410     Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));
1411 
1412   // We have two options: compare with register or immediate.  If the RHS of
1413   // the compare is an immediate that we can fold into this compare, use
1414   // CMPri, otherwise use CMPrr.
1415   if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
1416     if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
1417       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurMIMD, TII.get(CompareImmOpc))
1418         .addReg(Op0Reg)
1419         .addImm(Op1C->getSExtValue());
1420       return true;
1421     }
1422   }
1423 
1424   unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
1425   if (CompareOpc == 0) return false;
1426 
1427   Register Op1Reg = getRegForValue(Op1);
1428   if (!Op1Reg)
1429     return false;
1430   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurMIMD, TII.get(CompareOpc))
1431     .addReg(Op0Reg)
1432     .addReg(Op1Reg);
1433 
1434   return true;
1435 }
1436 
X86SelectCmp(const Instruction * I)1437 bool X86FastISel::X86SelectCmp(const Instruction *I) {
1438   const CmpInst *CI = cast<CmpInst>(I);
1439 
1440   MVT VT;
1441   if (!isTypeLegal(I->getOperand(0)->getType(), VT))
1442     return false;
1443 
1444   // Below code only works for scalars.
1445   if (VT.isVector())
1446     return false;
1447 
1448   // Try to optimize or fold the cmp.
1449   CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
1450   Register ResultReg;
1451   switch (Predicate) {
1452   default: break;
1453   case CmpInst::FCMP_FALSE: {
1454     ResultReg = createResultReg(&X86::GR32RegClass);
1455     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV32r0),
1456             ResultReg);
1457     ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, X86::sub_8bit);
1458     if (!ResultReg)
1459       return false;
1460     break;
1461   }
1462   case CmpInst::FCMP_TRUE: {
1463     ResultReg = createResultReg(&X86::GR8RegClass);
1464     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV8ri),
1465             ResultReg).addImm(1);
1466     break;
1467   }
1468   }
1469 
1470   if (ResultReg) {
1471     updateValueMap(I, ResultReg);
1472     return true;
1473   }
1474 
1475   const Value *LHS = CI->getOperand(0);
1476   const Value *RHS = CI->getOperand(1);
1477 
1478   // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
1479   // We don't have to materialize a zero constant for this case and can just use
1480   // %x again on the RHS.
1481   if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
1482     const auto *RHSC = dyn_cast<ConstantFP>(RHS);
1483     if (RHSC && RHSC->isNullValue())
1484       RHS = LHS;
1485   }
1486 
1487   // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
1488   static const uint16_t SETFOpcTable[2][3] = {
1489     { X86::COND_E,  X86::COND_NP, X86::AND8rr },
1490     { X86::COND_NE, X86::COND_P,  X86::OR8rr  }
1491   };
1492   const uint16_t *SETFOpc = nullptr;
1493   switch (Predicate) {
1494   default: break;
1495   case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
1496   case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
1497   }
1498 
1499   ResultReg = createResultReg(&X86::GR8RegClass);
1500   if (SETFOpc) {
1501     if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
1502       return false;
1503 
1504     Register FlagReg1 = createResultReg(&X86::GR8RegClass);
1505     Register FlagReg2 = createResultReg(&X86::GR8RegClass);
1506     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
1507             FlagReg1).addImm(SETFOpc[0]);
1508     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
1509             FlagReg2).addImm(SETFOpc[1]);
1510     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(SETFOpc[2]),
1511             ResultReg).addReg(FlagReg1).addReg(FlagReg2);
1512     updateValueMap(I, ResultReg);
1513     return true;
1514   }
1515 
1516   X86::CondCode CC;
1517   bool SwapArgs;
1518   std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
1519   assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
1520 
1521   if (SwapArgs)
1522     std::swap(LHS, RHS);
1523 
1524   // Emit a compare of LHS/RHS.
1525   if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
1526     return false;
1527 
1528   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
1529           ResultReg).addImm(CC);
1530   updateValueMap(I, ResultReg);
1531   return true;
1532 }
1533 
X86SelectZExt(const Instruction * I)1534 bool X86FastISel::X86SelectZExt(const Instruction *I) {
1535   EVT DstVT = TLI.getValueType(DL, I->getType());
1536   if (!TLI.isTypeLegal(DstVT))
1537     return false;
1538 
1539   Register ResultReg = getRegForValue(I->getOperand(0));
1540   if (!ResultReg)
1541     return false;
1542 
1543   // Handle zero-extension from i1 to i8, which is common.
1544   MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
1545   if (SrcVT == MVT::i1) {
1546     // Set the high bits to zero.
1547     ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg);
1548     SrcVT = MVT::i8;
1549 
1550     if (!ResultReg)
1551       return false;
1552   }
1553 
1554   if (DstVT == MVT::i64) {
1555     // Handle extension to 64-bits via sub-register shenanigans.
1556     unsigned MovInst;
1557 
1558     switch (SrcVT.SimpleTy) {
1559     case MVT::i8:  MovInst = X86::MOVZX32rr8;  break;
1560     case MVT::i16: MovInst = X86::MOVZX32rr16; break;
1561     case MVT::i32: MovInst = X86::MOV32rr;     break;
1562     default: llvm_unreachable("Unexpected zext to i64 source type");
1563     }
1564 
1565     Register Result32 = createResultReg(&X86::GR32RegClass);
1566     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(MovInst), Result32)
1567       .addReg(ResultReg);
1568 
1569     ResultReg = createResultReg(&X86::GR64RegClass);
1570     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::SUBREG_TO_REG),
1571             ResultReg)
1572       .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
1573   } else if (DstVT == MVT::i16) {
1574     // i8->i16 doesn't exist in the autogenerated isel table. Need to zero
1575     // extend to 32-bits and then extract down to 16-bits.
1576     Register Result32 = createResultReg(&X86::GR32RegClass);
1577     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOVZX32rr8),
1578             Result32).addReg(ResultReg);
1579 
1580     ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, X86::sub_16bit);
1581   } else if (DstVT != MVT::i8) {
1582     ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
1583                            ResultReg);
1584     if (!ResultReg)
1585       return false;
1586   }
1587 
1588   updateValueMap(I, ResultReg);
1589   return true;
1590 }
1591 
X86SelectSExt(const Instruction * I)1592 bool X86FastISel::X86SelectSExt(const Instruction *I) {
1593   EVT DstVT = TLI.getValueType(DL, I->getType());
1594   if (!TLI.isTypeLegal(DstVT))
1595     return false;
1596 
1597   Register ResultReg = getRegForValue(I->getOperand(0));
1598   if (!ResultReg)
1599     return false;
1600 
1601   // Handle sign-extension from i1 to i8.
1602   MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
1603   if (SrcVT == MVT::i1) {
1604     // Set the high bits to zero.
1605     Register ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg);
1606     if (!ZExtReg)
1607       return false;
1608 
1609     // Negate the result to make an 8-bit sign extended value.
1610     ResultReg = createResultReg(&X86::GR8RegClass);
1611     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::NEG8r),
1612             ResultReg).addReg(ZExtReg);
1613 
1614     SrcVT = MVT::i8;
1615   }
1616 
1617   if (DstVT == MVT::i16) {
1618     // i8->i16 doesn't exist in the autogenerated isel table. Need to sign
1619     // extend to 32-bits and then extract down to 16-bits.
1620     Register Result32 = createResultReg(&X86::GR32RegClass);
1621     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOVSX32rr8),
1622             Result32).addReg(ResultReg);
1623 
1624     ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, X86::sub_16bit);
1625   } else if (DstVT != MVT::i8) {
1626     ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::SIGN_EXTEND,
1627                            ResultReg);
1628     if (!ResultReg)
1629       return false;
1630   }
1631 
1632   updateValueMap(I, ResultReg);
1633   return true;
1634 }
1635 
X86SelectBranch(const Instruction * I)1636 bool X86FastISel::X86SelectBranch(const Instruction *I) {
1637   // Unconditional branches are selected by tablegen-generated code.
1638   // Handle a conditional branch.
1639   const BranchInst *BI = cast<BranchInst>(I);
1640   MachineBasicBlock *TrueMBB = FuncInfo.getMBB(BI->getSuccessor(0));
1641   MachineBasicBlock *FalseMBB = FuncInfo.getMBB(BI->getSuccessor(1));
1642 
1643   // Fold the common case of a conditional branch with a comparison
1644   // in the same block (values defined on other blocks may not have
1645   // initialized registers).
1646   X86::CondCode CC;
1647   if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
1648     if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
1649       EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType());
1650 
1651       // Try to optimize or fold the cmp.
1652       CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
1653       switch (Predicate) {
1654       default: break;
1655       case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, MIMD.getDL()); return true;
1656       case CmpInst::FCMP_TRUE:  fastEmitBranch(TrueMBB, MIMD.getDL()); return true;
1657       }
1658 
1659       const Value *CmpLHS = CI->getOperand(0);
1660       const Value *CmpRHS = CI->getOperand(1);
1661 
1662       // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
1663       // 0.0.
1664       // We don't have to materialize a zero constant for this case and can just
1665       // use %x again on the RHS.
1666       if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
1667         const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
1668         if (CmpRHSC && CmpRHSC->isNullValue())
1669           CmpRHS = CmpLHS;
1670       }
1671 
1672       // Try to take advantage of fallthrough opportunities.
1673       if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
1674         std::swap(TrueMBB, FalseMBB);
1675         Predicate = CmpInst::getInversePredicate(Predicate);
1676       }
1677 
1678       // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
1679       // code check. Instead two branch instructions are required to check all
1680       // the flags. First we change the predicate to a supported condition code,
1681       // which will be the first branch. Later one we will emit the second
1682       // branch.
1683       bool NeedExtraBranch = false;
1684       switch (Predicate) {
1685       default: break;
1686       case CmpInst::FCMP_OEQ:
1687         std::swap(TrueMBB, FalseMBB);
1688         [[fallthrough]];
1689       case CmpInst::FCMP_UNE:
1690         NeedExtraBranch = true;
1691         Predicate = CmpInst::FCMP_ONE;
1692         break;
1693       }
1694 
1695       bool SwapArgs;
1696       std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
1697       assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
1698 
1699       if (SwapArgs)
1700         std::swap(CmpLHS, CmpRHS);
1701 
1702       // Emit a compare of the LHS and RHS, setting the flags.
1703       if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
1704         return false;
1705 
1706       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
1707         .addMBB(TrueMBB).addImm(CC);
1708 
1709       // X86 requires a second branch to handle UNE (and OEQ, which is mapped
1710       // to UNE above).
1711       if (NeedExtraBranch) {
1712         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
1713           .addMBB(TrueMBB).addImm(X86::COND_P);
1714       }
1715 
1716       finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
1717       return true;
1718     }
1719   } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
1720     // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
1721     // typically happen for _Bool and C++ bools.
1722     MVT SourceVT;
1723     if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
1724         isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
1725       unsigned TestOpc = 0;
1726       switch (SourceVT.SimpleTy) {
1727       default: break;
1728       case MVT::i8:  TestOpc = X86::TEST8ri; break;
1729       case MVT::i16: TestOpc = X86::TEST16ri; break;
1730       case MVT::i32: TestOpc = X86::TEST32ri; break;
1731       case MVT::i64: TestOpc = X86::TEST64ri32; break;
1732       }
1733       if (TestOpc) {
1734         Register OpReg = getRegForValue(TI->getOperand(0));
1735         if (!OpReg)
1736           return false;
1737 
1738         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TestOpc))
1739           .addReg(OpReg).addImm(1);
1740 
1741         unsigned JmpCond = X86::COND_NE;
1742         if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
1743           std::swap(TrueMBB, FalseMBB);
1744           JmpCond = X86::COND_E;
1745         }
1746 
1747         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
1748           .addMBB(TrueMBB).addImm(JmpCond);
1749 
1750         finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
1751         return true;
1752       }
1753     }
1754   } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
1755     // Fake request the condition, otherwise the intrinsic might be completely
1756     // optimized away.
1757     Register TmpReg = getRegForValue(BI->getCondition());
1758     if (!TmpReg)
1759       return false;
1760 
1761     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
1762       .addMBB(TrueMBB).addImm(CC);
1763     finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
1764     return true;
1765   }
1766 
1767   // Otherwise do a clumsy setcc and re-test it.
1768   // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
1769   // in an explicit cast, so make sure to handle that correctly.
1770   Register OpReg = getRegForValue(BI->getCondition());
1771   if (!OpReg)
1772     return false;
1773 
1774   // In case OpReg is a K register, COPY to a GPR
1775   if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) {
1776     Register KOpReg = OpReg;
1777     OpReg = createResultReg(&X86::GR32RegClass);
1778     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1779             TII.get(TargetOpcode::COPY), OpReg)
1780         .addReg(KOpReg);
1781     OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, X86::sub_8bit);
1782   }
1783   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::TEST8ri))
1784       .addReg(OpReg)
1785       .addImm(1);
1786   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
1787     .addMBB(TrueMBB).addImm(X86::COND_NE);
1788   finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
1789   return true;
1790 }
1791 
X86SelectShift(const Instruction * I)1792 bool X86FastISel::X86SelectShift(const Instruction *I) {
1793   Register CReg;
1794   unsigned OpReg;
1795   const TargetRegisterClass *RC = nullptr;
1796   if (I->getType()->isIntegerTy(8)) {
1797     CReg = X86::CL;
1798     RC = &X86::GR8RegClass;
1799     switch (I->getOpcode()) {
1800     case Instruction::LShr: OpReg = X86::SHR8rCL; break;
1801     case Instruction::AShr: OpReg = X86::SAR8rCL; break;
1802     case Instruction::Shl:  OpReg = X86::SHL8rCL; break;
1803     default: return false;
1804     }
1805   } else if (I->getType()->isIntegerTy(16)) {
1806     CReg = X86::CX;
1807     RC = &X86::GR16RegClass;
1808     switch (I->getOpcode()) {
1809     default: llvm_unreachable("Unexpected shift opcode");
1810     case Instruction::LShr: OpReg = X86::SHR16rCL; break;
1811     case Instruction::AShr: OpReg = X86::SAR16rCL; break;
1812     case Instruction::Shl:  OpReg = X86::SHL16rCL; break;
1813     }
1814   } else if (I->getType()->isIntegerTy(32)) {
1815     CReg = X86::ECX;
1816     RC = &X86::GR32RegClass;
1817     switch (I->getOpcode()) {
1818     default: llvm_unreachable("Unexpected shift opcode");
1819     case Instruction::LShr: OpReg = X86::SHR32rCL; break;
1820     case Instruction::AShr: OpReg = X86::SAR32rCL; break;
1821     case Instruction::Shl:  OpReg = X86::SHL32rCL; break;
1822     }
1823   } else if (I->getType()->isIntegerTy(64)) {
1824     CReg = X86::RCX;
1825     RC = &X86::GR64RegClass;
1826     switch (I->getOpcode()) {
1827     default: llvm_unreachable("Unexpected shift opcode");
1828     case Instruction::LShr: OpReg = X86::SHR64rCL; break;
1829     case Instruction::AShr: OpReg = X86::SAR64rCL; break;
1830     case Instruction::Shl:  OpReg = X86::SHL64rCL; break;
1831     }
1832   } else {
1833     return false;
1834   }
1835 
1836   MVT VT;
1837   if (!isTypeLegal(I->getType(), VT))
1838     return false;
1839 
1840   Register Op0Reg = getRegForValue(I->getOperand(0));
1841   if (!Op0Reg)
1842     return false;
1843 
1844   Register Op1Reg = getRegForValue(I->getOperand(1));
1845   if (!Op1Reg)
1846     return false;
1847   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
1848           CReg).addReg(Op1Reg);
1849 
1850   // The shift instruction uses X86::CL. If we defined a super-register
1851   // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
1852   if (CReg != X86::CL)
1853     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1854             TII.get(TargetOpcode::KILL), X86::CL)
1855       .addReg(CReg, RegState::Kill);
1856 
1857   Register ResultReg = createResultReg(RC);
1858   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(OpReg), ResultReg)
1859     .addReg(Op0Reg);
1860   updateValueMap(I, ResultReg);
1861   return true;
1862 }
1863 
X86SelectDivRem(const Instruction * I)1864 bool X86FastISel::X86SelectDivRem(const Instruction *I) {
1865   const static unsigned NumTypes = 4; // i8, i16, i32, i64
1866   const static unsigned NumOps   = 4; // SDiv, SRem, UDiv, URem
1867   const static bool S = true;  // IsSigned
1868   const static bool U = false; // !IsSigned
1869   const static unsigned Copy = TargetOpcode::COPY;
1870   // For the X86 DIV/IDIV instruction, in most cases the dividend
1871   // (numerator) must be in a specific register pair highreg:lowreg,
1872   // producing the quotient in lowreg and the remainder in highreg.
1873   // For most data types, to set up the instruction, the dividend is
1874   // copied into lowreg, and lowreg is sign-extended or zero-extended
1875   // into highreg.  The exception is i8, where the dividend is defined
1876   // as a single register rather than a register pair, and we
1877   // therefore directly sign-extend or zero-extend the dividend into
1878   // lowreg, instead of copying, and ignore the highreg.
1879   const static struct DivRemEntry {
1880     // The following portion depends only on the data type.
1881     const TargetRegisterClass *RC;
1882     unsigned LowInReg;  // low part of the register pair
1883     unsigned HighInReg; // high part of the register pair
1884     // The following portion depends on both the data type and the operation.
1885     struct DivRemResult {
1886     unsigned OpDivRem;        // The specific DIV/IDIV opcode to use.
1887     unsigned OpSignExtend;    // Opcode for sign-extending lowreg into
1888                               // highreg, or copying a zero into highreg.
1889     unsigned OpCopy;          // Opcode for copying dividend into lowreg, or
1890                               // zero/sign-extending into lowreg for i8.
1891     unsigned DivRemResultReg; // Register containing the desired result.
1892     bool IsOpSigned;          // Whether to use signed or unsigned form.
1893     } ResultTable[NumOps];
1894   } OpTable[NumTypes] = {
1895     { &X86::GR8RegClass,  X86::AX,  0, {
1896         { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AL,  S }, // SDiv
1897         { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AH,  S }, // SRem
1898         { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AL,  U }, // UDiv
1899         { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AH,  U }, // URem
1900       }
1901     }, // i8
1902     { &X86::GR16RegClass, X86::AX,  X86::DX, {
1903         { X86::IDIV16r, X86::CWD,     Copy,            X86::AX,  S }, // SDiv
1904         { X86::IDIV16r, X86::CWD,     Copy,            X86::DX,  S }, // SRem
1905         { X86::DIV16r,  X86::MOV32r0, Copy,            X86::AX,  U }, // UDiv
1906         { X86::DIV16r,  X86::MOV32r0, Copy,            X86::DX,  U }, // URem
1907       }
1908     }, // i16
1909     { &X86::GR32RegClass, X86::EAX, X86::EDX, {
1910         { X86::IDIV32r, X86::CDQ,     Copy,            X86::EAX, S }, // SDiv
1911         { X86::IDIV32r, X86::CDQ,     Copy,            X86::EDX, S }, // SRem
1912         { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EAX, U }, // UDiv
1913         { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EDX, U }, // URem
1914       }
1915     }, // i32
1916     { &X86::GR64RegClass, X86::RAX, X86::RDX, {
1917         { X86::IDIV64r, X86::CQO,     Copy,            X86::RAX, S }, // SDiv
1918         { X86::IDIV64r, X86::CQO,     Copy,            X86::RDX, S }, // SRem
1919         { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RAX, U }, // UDiv
1920         { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RDX, U }, // URem
1921       }
1922     }, // i64
1923   };
1924 
1925   MVT VT;
1926   if (!isTypeLegal(I->getType(), VT))
1927     return false;
1928 
1929   unsigned TypeIndex, OpIndex;
1930   switch (VT.SimpleTy) {
1931   default: return false;
1932   case MVT::i8:  TypeIndex = 0; break;
1933   case MVT::i16: TypeIndex = 1; break;
1934   case MVT::i32: TypeIndex = 2; break;
1935   case MVT::i64: TypeIndex = 3;
1936     if (!Subtarget->is64Bit())
1937       return false;
1938     break;
1939   }
1940 
1941   switch (I->getOpcode()) {
1942   default: llvm_unreachable("Unexpected div/rem opcode");
1943   case Instruction::SDiv: OpIndex = 0; break;
1944   case Instruction::SRem: OpIndex = 1; break;
1945   case Instruction::UDiv: OpIndex = 2; break;
1946   case Instruction::URem: OpIndex = 3; break;
1947   }
1948 
1949   const DivRemEntry &TypeEntry = OpTable[TypeIndex];
1950   const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
1951   Register Op0Reg = getRegForValue(I->getOperand(0));
1952   if (!Op0Reg)
1953     return false;
1954   Register Op1Reg = getRegForValue(I->getOperand(1));
1955   if (!Op1Reg)
1956     return false;
1957 
1958   // Move op0 into low-order input register.
1959   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1960           TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
1961   // Zero-extend or sign-extend into high-order input register.
1962   if (OpEntry.OpSignExtend) {
1963     if (OpEntry.IsOpSigned)
1964       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1965               TII.get(OpEntry.OpSignExtend));
1966     else {
1967       Register Zero32 = createResultReg(&X86::GR32RegClass);
1968       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1969               TII.get(X86::MOV32r0), Zero32);
1970 
1971       // Copy the zero into the appropriate sub/super/identical physical
1972       // register. Unfortunately the operations needed are not uniform enough
1973       // to fit neatly into the table above.
1974       if (VT == MVT::i16) {
1975         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1976                 TII.get(Copy), TypeEntry.HighInReg)
1977           .addReg(Zero32, 0, X86::sub_16bit);
1978       } else if (VT == MVT::i32) {
1979         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1980                 TII.get(Copy), TypeEntry.HighInReg)
1981             .addReg(Zero32);
1982       } else if (VT == MVT::i64) {
1983         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1984                 TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
1985             .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
1986       }
1987     }
1988   }
1989   // Generate the DIV/IDIV instruction.
1990   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1991           TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
1992   // For i8 remainder, we can't reference ah directly, as we'll end
1993   // up with bogus copies like %r9b = COPY %ah. Reference ax
1994   // instead to prevent ah references in a rex instruction.
1995   //
1996   // The current assumption of the fast register allocator is that isel
1997   // won't generate explicit references to the GR8_NOREX registers. If
1998   // the allocator and/or the backend get enhanced to be more robust in
1999   // that regard, this can be, and should be, removed.
2000   Register ResultReg;
2001   if ((I->getOpcode() == Instruction::SRem ||
2002        I->getOpcode() == Instruction::URem) &&
2003       OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
2004     Register SourceSuperReg = createResultReg(&X86::GR16RegClass);
2005     Register ResultSuperReg = createResultReg(&X86::GR16RegClass);
2006     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2007             TII.get(Copy), SourceSuperReg).addReg(X86::AX);
2008 
2009     // Shift AX right by 8 bits instead of using AH.
2010     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SHR16ri),
2011             ResultSuperReg).addReg(SourceSuperReg).addImm(8);
2012 
2013     // Now reference the 8-bit subreg of the result.
2014     ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
2015                                            X86::sub_8bit);
2016   }
2017   // Copy the result out of the physreg if we haven't already.
2018   if (!ResultReg) {
2019     ResultReg = createResultReg(TypeEntry.RC);
2020     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Copy), ResultReg)
2021         .addReg(OpEntry.DivRemResultReg);
2022   }
2023   updateValueMap(I, ResultReg);
2024 
2025   return true;
2026 }
2027 
2028 /// Emit a conditional move instruction (if the are supported) to lower
2029 /// the select.
X86FastEmitCMoveSelect(MVT RetVT,const Instruction * I)2030 bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
2031   // Check if the subtarget supports these instructions.
2032   if (!Subtarget->canUseCMOV())
2033     return false;
2034 
2035   // FIXME: Add support for i8.
2036   if (RetVT < MVT::i16 || RetVT > MVT::i64)
2037     return false;
2038 
2039   const Value *Cond = I->getOperand(0);
2040   const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
2041   bool NeedTest = true;
2042   X86::CondCode CC = X86::COND_NE;
2043 
2044   // Optimize conditions coming from a compare if both instructions are in the
2045   // same basic block (values defined in other basic blocks may not have
2046   // initialized registers).
2047   const auto *CI = dyn_cast<CmpInst>(Cond);
2048   if (CI && (CI->getParent() == I->getParent())) {
2049     CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
2050 
2051     // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
2052     static const uint16_t SETFOpcTable[2][3] = {
2053       { X86::COND_NP, X86::COND_E,  X86::TEST8rr },
2054       { X86::COND_P,  X86::COND_NE, X86::OR8rr   }
2055     };
2056     const uint16_t *SETFOpc = nullptr;
2057     switch (Predicate) {
2058     default: break;
2059     case CmpInst::FCMP_OEQ:
2060       SETFOpc = &SETFOpcTable[0][0];
2061       Predicate = CmpInst::ICMP_NE;
2062       break;
2063     case CmpInst::FCMP_UNE:
2064       SETFOpc = &SETFOpcTable[1][0];
2065       Predicate = CmpInst::ICMP_NE;
2066       break;
2067     }
2068 
2069     bool NeedSwap;
2070     std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate);
2071     assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
2072 
2073     const Value *CmpLHS = CI->getOperand(0);
2074     const Value *CmpRHS = CI->getOperand(1);
2075     if (NeedSwap)
2076       std::swap(CmpLHS, CmpRHS);
2077 
2078     EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
2079     // Emit a compare of the LHS and RHS, setting the flags.
2080     if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
2081       return false;
2082 
2083     if (SETFOpc) {
2084       Register FlagReg1 = createResultReg(&X86::GR8RegClass);
2085       Register FlagReg2 = createResultReg(&X86::GR8RegClass);
2086       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
2087               FlagReg1).addImm(SETFOpc[0]);
2088       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
2089               FlagReg2).addImm(SETFOpc[1]);
2090       auto const &II = TII.get(SETFOpc[2]);
2091       if (II.getNumDefs()) {
2092         Register TmpReg = createResultReg(&X86::GR8RegClass);
2093         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, TmpReg)
2094           .addReg(FlagReg2).addReg(FlagReg1);
2095       } else {
2096         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
2097           .addReg(FlagReg2).addReg(FlagReg1);
2098       }
2099     }
2100     NeedTest = false;
2101   } else if (foldX86XALUIntrinsic(CC, I, Cond)) {
2102     // Fake request the condition, otherwise the intrinsic might be completely
2103     // optimized away.
2104     Register TmpReg = getRegForValue(Cond);
2105     if (!TmpReg)
2106       return false;
2107 
2108     NeedTest = false;
2109   }
2110 
2111   if (NeedTest) {
2112     // Selects operate on i1, however, CondReg is 8 bits width and may contain
2113     // garbage. Indeed, only the less significant bit is supposed to be
2114     // accurate. If we read more than the lsb, we may see non-zero values
2115     // whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for
2116     // the select. This is achieved by performing TEST against 1.
2117     Register CondReg = getRegForValue(Cond);
2118     if (!CondReg)
2119       return false;
2120 
2121     // In case OpReg is a K register, COPY to a GPR
2122     if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
2123       Register KCondReg = CondReg;
2124       CondReg = createResultReg(&X86::GR32RegClass);
2125       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2126               TII.get(TargetOpcode::COPY), CondReg)
2127           .addReg(KCondReg);
2128       CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, X86::sub_8bit);
2129     }
2130     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::TEST8ri))
2131         .addReg(CondReg)
2132         .addImm(1);
2133   }
2134 
2135   const Value *LHS = I->getOperand(1);
2136   const Value *RHS = I->getOperand(2);
2137 
2138   Register RHSReg = getRegForValue(RHS);
2139   Register LHSReg = getRegForValue(LHS);
2140   if (!LHSReg || !RHSReg)
2141     return false;
2142 
2143   const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo();
2144   unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC) / 8, false,
2145                                     Subtarget->hasNDD());
2146   Register ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, LHSReg, CC);
2147   updateValueMap(I, ResultReg);
2148   return true;
2149 }
2150 
2151 /// Emit SSE or AVX instructions to lower the select.
2152 ///
2153 /// Try to use SSE1/SSE2 instructions to simulate a select without branches.
2154 /// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
2155 /// SSE instructions are available. If AVX is available, try to use a VBLENDV.
X86FastEmitSSESelect(MVT RetVT,const Instruction * I)2156 bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
2157   // Optimize conditions coming from a compare if both instructions are in the
2158   // same basic block (values defined in other basic blocks may not have
2159   // initialized registers).
2160   const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
2161   if (!CI || (CI->getParent() != I->getParent()))
2162     return false;
2163 
2164   if (I->getType() != CI->getOperand(0)->getType() ||
2165       !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
2166         (Subtarget->hasSSE2() && RetVT == MVT::f64)))
2167     return false;
2168 
2169   const Value *CmpLHS = CI->getOperand(0);
2170   const Value *CmpRHS = CI->getOperand(1);
2171   CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
2172 
2173   // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
2174   // We don't have to materialize a zero constant for this case and can just use
2175   // %x again on the RHS.
2176   if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
2177     const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
2178     if (CmpRHSC && CmpRHSC->isNullValue())
2179       CmpRHS = CmpLHS;
2180   }
2181 
2182   unsigned CC;
2183   bool NeedSwap;
2184   std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
2185   if (CC > 7 && !Subtarget->hasAVX())
2186     return false;
2187 
2188   if (NeedSwap)
2189     std::swap(CmpLHS, CmpRHS);
2190 
2191   const Value *LHS = I->getOperand(1);
2192   const Value *RHS = I->getOperand(2);
2193 
2194   Register LHSReg = getRegForValue(LHS);
2195   Register RHSReg = getRegForValue(RHS);
2196   Register CmpLHSReg = getRegForValue(CmpLHS);
2197   Register CmpRHSReg = getRegForValue(CmpRHS);
2198   if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
2199     return false;
2200 
2201   const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
2202   Register ResultReg;
2203 
2204   if (Subtarget->hasAVX512()) {
2205     // If we have AVX512 we can use a mask compare and masked movss/sd.
2206     const TargetRegisterClass *VR128X = &X86::VR128XRegClass;
2207     const TargetRegisterClass *VK1 = &X86::VK1RegClass;
2208 
2209     unsigned CmpOpcode =
2210       (RetVT == MVT::f32) ? X86::VCMPSSZrri : X86::VCMPSDZrri;
2211     Register CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpRHSReg,
2212                                        CC);
2213 
2214     // Need an IMPLICIT_DEF for the input that is used to generate the upper
2215     // bits of the result register since its not based on any of the inputs.
2216     Register ImplicitDefReg = createResultReg(VR128X);
2217     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2218             TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
2219 
2220     // Place RHSReg is the passthru of the masked movss/sd operation and put
2221     // LHS in the input. The mask input comes from the compare.
2222     unsigned MovOpcode =
2223       (RetVT == MVT::f32) ? X86::VMOVSSZrrk : X86::VMOVSDZrrk;
2224     Register MovReg = fastEmitInst_rrrr(MovOpcode, VR128X, RHSReg, CmpReg,
2225                                         ImplicitDefReg, LHSReg);
2226 
2227     ResultReg = createResultReg(RC);
2228     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2229             TII.get(TargetOpcode::COPY), ResultReg).addReg(MovReg);
2230 
2231   } else if (Subtarget->hasAVX()) {
2232     const TargetRegisterClass *VR128 = &X86::VR128RegClass;
2233 
2234     // If we have AVX, create 1 blendv instead of 3 logic instructions.
2235     // Blendv was introduced with SSE 4.1, but the 2 register form implicitly
2236     // uses XMM0 as the selection register. That may need just as many
2237     // instructions as the AND/ANDN/OR sequence due to register moves, so
2238     // don't bother.
2239     unsigned CmpOpcode =
2240       (RetVT == MVT::f32) ? X86::VCMPSSrri : X86::VCMPSDrri;
2241     unsigned BlendOpcode =
2242       (RetVT == MVT::f32) ? X86::VBLENDVPSrrr : X86::VBLENDVPDrrr;
2243 
2244     Register CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpRHSReg,
2245                                        CC);
2246     Register VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, LHSReg,
2247                                           CmpReg);
2248     ResultReg = createResultReg(RC);
2249     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2250             TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
2251   } else {
2252     // Choose the SSE instruction sequence based on data type (float or double).
2253     static const uint16_t OpcTable[2][4] = {
2254       { X86::CMPSSrri,  X86::ANDPSrr,  X86::ANDNPSrr,  X86::ORPSrr  },
2255       { X86::CMPSDrri,  X86::ANDPDrr,  X86::ANDNPDrr,  X86::ORPDrr  }
2256     };
2257 
2258     const uint16_t *Opc = nullptr;
2259     switch (RetVT.SimpleTy) {
2260     default: return false;
2261     case MVT::f32: Opc = &OpcTable[0][0]; break;
2262     case MVT::f64: Opc = &OpcTable[1][0]; break;
2263     }
2264 
2265     const TargetRegisterClass *VR128 = &X86::VR128RegClass;
2266     Register CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpRHSReg, CC);
2267     Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, LHSReg);
2268     Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, RHSReg);
2269     Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, AndReg);
2270     ResultReg = createResultReg(RC);
2271     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2272             TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg);
2273   }
2274   updateValueMap(I, ResultReg);
2275   return true;
2276 }
2277 
X86FastEmitPseudoSelect(MVT RetVT,const Instruction * I)2278 bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
2279   // These are pseudo CMOV instructions and will be later expanded into control-
2280   // flow.
2281   unsigned Opc;
2282   switch (RetVT.SimpleTy) {
2283   default: return false;
2284   case MVT::i8:  Opc = X86::CMOV_GR8;   break;
2285   case MVT::i16: Opc = X86::CMOV_GR16;  break;
2286   case MVT::i32: Opc = X86::CMOV_GR32;  break;
2287   case MVT::f16:
2288     Opc = Subtarget->hasAVX512() ? X86::CMOV_FR16X : X86::CMOV_FR16; break;
2289   case MVT::f32:
2290     Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X : X86::CMOV_FR32; break;
2291   case MVT::f64:
2292     Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X : X86::CMOV_FR64; break;
2293   }
2294 
2295   const Value *Cond = I->getOperand(0);
2296   X86::CondCode CC = X86::COND_NE;
2297 
2298   // Optimize conditions coming from a compare if both instructions are in the
2299   // same basic block (values defined in other basic blocks may not have
2300   // initialized registers).
2301   const auto *CI = dyn_cast<CmpInst>(Cond);
2302   if (CI && (CI->getParent() == I->getParent())) {
2303     bool NeedSwap;
2304     std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CI->getPredicate());
2305     if (CC > X86::LAST_VALID_COND)
2306       return false;
2307 
2308     const Value *CmpLHS = CI->getOperand(0);
2309     const Value *CmpRHS = CI->getOperand(1);
2310 
2311     if (NeedSwap)
2312       std::swap(CmpLHS, CmpRHS);
2313 
2314     EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
2315     if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
2316       return false;
2317   } else {
2318     Register CondReg = getRegForValue(Cond);
2319     if (!CondReg)
2320       return false;
2321 
2322     // In case OpReg is a K register, COPY to a GPR
2323     if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
2324       Register KCondReg = CondReg;
2325       CondReg = createResultReg(&X86::GR32RegClass);
2326       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2327               TII.get(TargetOpcode::COPY), CondReg)
2328           .addReg(KCondReg);
2329       CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, X86::sub_8bit);
2330     }
2331     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::TEST8ri))
2332         .addReg(CondReg)
2333         .addImm(1);
2334   }
2335 
2336   const Value *LHS = I->getOperand(1);
2337   const Value *RHS = I->getOperand(2);
2338 
2339   Register LHSReg = getRegForValue(LHS);
2340   Register RHSReg = getRegForValue(RHS);
2341   if (!LHSReg || !RHSReg)
2342     return false;
2343 
2344   const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
2345 
2346   Register ResultReg =
2347     fastEmitInst_rri(Opc, RC, RHSReg, LHSReg, CC);
2348   updateValueMap(I, ResultReg);
2349   return true;
2350 }
2351 
X86SelectSelect(const Instruction * I)2352 bool X86FastISel::X86SelectSelect(const Instruction *I) {
2353   MVT RetVT;
2354   if (!isTypeLegal(I->getType(), RetVT))
2355     return false;
2356 
2357   // Check if we can fold the select.
2358   if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {
2359     CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
2360     const Value *Opnd = nullptr;
2361     switch (Predicate) {
2362     default:                              break;
2363     case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;
2364     case CmpInst::FCMP_TRUE:  Opnd = I->getOperand(1); break;
2365     }
2366     // No need for a select anymore - this is an unconditional move.
2367     if (Opnd) {
2368       Register OpReg = getRegForValue(Opnd);
2369       if (!OpReg)
2370         return false;
2371       const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
2372       Register ResultReg = createResultReg(RC);
2373       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2374               TII.get(TargetOpcode::COPY), ResultReg)
2375         .addReg(OpReg);
2376       updateValueMap(I, ResultReg);
2377       return true;
2378     }
2379   }
2380 
2381   // First try to use real conditional move instructions.
2382   if (X86FastEmitCMoveSelect(RetVT, I))
2383     return true;
2384 
2385   // Try to use a sequence of SSE instructions to simulate a conditional move.
2386   if (X86FastEmitSSESelect(RetVT, I))
2387     return true;
2388 
2389   // Fall-back to pseudo conditional move instructions, which will be later
2390   // converted to control-flow.
2391   if (X86FastEmitPseudoSelect(RetVT, I))
2392     return true;
2393 
2394   return false;
2395 }
2396 
2397 // Common code for X86SelectSIToFP and X86SelectUIToFP.
X86SelectIntToFP(const Instruction * I,bool IsSigned)2398 bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) {
2399   // The target-independent selection algorithm in FastISel already knows how
2400   // to select a SINT_TO_FP if the target is SSE but not AVX.
2401   // Early exit if the subtarget doesn't have AVX.
2402   // Unsigned conversion requires avx512.
2403   bool HasAVX512 = Subtarget->hasAVX512();
2404   if (!Subtarget->hasAVX() || (!IsSigned && !HasAVX512))
2405     return false;
2406 
2407   // TODO: We could sign extend narrower types.
2408   EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
2409   if (SrcVT != MVT::i32 && SrcVT != MVT::i64)
2410     return false;
2411 
2412   // Select integer to float/double conversion.
2413   Register OpReg = getRegForValue(I->getOperand(0));
2414   if (!OpReg)
2415     return false;
2416 
2417   unsigned Opcode;
2418 
2419   static const uint16_t SCvtOpc[2][2][2] = {
2420     { { X86::VCVTSI2SSrr,  X86::VCVTSI642SSrr },
2421       { X86::VCVTSI2SDrr,  X86::VCVTSI642SDrr } },
2422     { { X86::VCVTSI2SSZrr, X86::VCVTSI642SSZrr },
2423       { X86::VCVTSI2SDZrr, X86::VCVTSI642SDZrr } },
2424   };
2425   static const uint16_t UCvtOpc[2][2] = {
2426     { X86::VCVTUSI2SSZrr, X86::VCVTUSI642SSZrr },
2427     { X86::VCVTUSI2SDZrr, X86::VCVTUSI642SDZrr },
2428   };
2429   bool Is64Bit = SrcVT == MVT::i64;
2430 
2431   if (I->getType()->isDoubleTy()) {
2432     // s/uitofp int -> double
2433     Opcode = IsSigned ? SCvtOpc[HasAVX512][1][Is64Bit] : UCvtOpc[1][Is64Bit];
2434   } else if (I->getType()->isFloatTy()) {
2435     // s/uitofp int -> float
2436     Opcode = IsSigned ? SCvtOpc[HasAVX512][0][Is64Bit] : UCvtOpc[0][Is64Bit];
2437   } else
2438     return false;
2439 
2440   MVT DstVT = TLI.getValueType(DL, I->getType()).getSimpleVT();
2441   const TargetRegisterClass *RC = TLI.getRegClassFor(DstVT);
2442   Register ImplicitDefReg = createResultReg(RC);
2443   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2444           TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
2445   Register ResultReg = fastEmitInst_rr(Opcode, RC, ImplicitDefReg, OpReg);
2446   updateValueMap(I, ResultReg);
2447   return true;
2448 }
2449 
X86SelectSIToFP(const Instruction * I)2450 bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
2451   return X86SelectIntToFP(I, /*IsSigned*/true);
2452 }
2453 
X86SelectUIToFP(const Instruction * I)2454 bool X86FastISel::X86SelectUIToFP(const Instruction *I) {
2455   return X86SelectIntToFP(I, /*IsSigned*/false);
2456 }
2457 
2458 // Helper method used by X86SelectFPExt and X86SelectFPTrunc.
X86SelectFPExtOrFPTrunc(const Instruction * I,unsigned TargetOpc,const TargetRegisterClass * RC)2459 bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
2460                                           unsigned TargetOpc,
2461                                           const TargetRegisterClass *RC) {
2462   assert((I->getOpcode() == Instruction::FPExt ||
2463           I->getOpcode() == Instruction::FPTrunc) &&
2464          "Instruction must be an FPExt or FPTrunc!");
2465   bool HasAVX = Subtarget->hasAVX();
2466 
2467   Register OpReg = getRegForValue(I->getOperand(0));
2468   if (!OpReg)
2469     return false;
2470 
2471   Register ImplicitDefReg;
2472   if (HasAVX) {
2473     ImplicitDefReg = createResultReg(RC);
2474     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2475             TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
2476 
2477   }
2478 
2479   Register ResultReg = createResultReg(RC);
2480   MachineInstrBuilder MIB;
2481   MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpc),
2482                 ResultReg);
2483 
2484   if (HasAVX)
2485     MIB.addReg(ImplicitDefReg);
2486 
2487   MIB.addReg(OpReg);
2488   updateValueMap(I, ResultReg);
2489   return true;
2490 }
2491 
X86SelectFPExt(const Instruction * I)2492 bool X86FastISel::X86SelectFPExt(const Instruction *I) {
2493   if (Subtarget->hasSSE2() && I->getType()->isDoubleTy() &&
2494       I->getOperand(0)->getType()->isFloatTy()) {
2495     bool HasAVX512 = Subtarget->hasAVX512();
2496     // fpext from float to double.
2497     unsigned Opc =
2498         HasAVX512 ? X86::VCVTSS2SDZrr
2499                   : Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr;
2500     return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f64));
2501   }
2502 
2503   return false;
2504 }
2505 
X86SelectFPTrunc(const Instruction * I)2506 bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
2507   if (Subtarget->hasSSE2() && I->getType()->isFloatTy() &&
2508       I->getOperand(0)->getType()->isDoubleTy()) {
2509     bool HasAVX512 = Subtarget->hasAVX512();
2510     // fptrunc from double to float.
2511     unsigned Opc =
2512         HasAVX512 ? X86::VCVTSD2SSZrr
2513                   : Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr;
2514     return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f32));
2515   }
2516 
2517   return false;
2518 }
2519 
X86SelectTrunc(const Instruction * I)2520 bool X86FastISel::X86SelectTrunc(const Instruction *I) {
2521   EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
2522   EVT DstVT = TLI.getValueType(DL, I->getType());
2523 
2524   // This code only handles truncation to byte.
2525   if (DstVT != MVT::i8 && DstVT != MVT::i1)
2526     return false;
2527   if (!TLI.isTypeLegal(SrcVT))
2528     return false;
2529 
2530   Register InputReg = getRegForValue(I->getOperand(0));
2531   if (!InputReg)
2532     // Unhandled operand.  Halt "fast" selection and bail.
2533     return false;
2534 
2535   if (SrcVT == MVT::i8) {
2536     // Truncate from i8 to i1; no code needed.
2537     updateValueMap(I, InputReg);
2538     return true;
2539   }
2540 
2541   // Issue an extract_subreg.
2542   Register ResultReg = fastEmitInst_extractsubreg(MVT::i8, InputReg,
2543                                                   X86::sub_8bit);
2544   if (!ResultReg)
2545     return false;
2546 
2547   updateValueMap(I, ResultReg);
2548   return true;
2549 }
2550 
X86SelectBitCast(const Instruction * I)2551 bool X86FastISel::X86SelectBitCast(const Instruction *I) {
2552   // Select SSE2/AVX bitcasts between 128/256/512 bit vector types.
2553   MVT SrcVT, DstVT;
2554   if (!Subtarget->hasSSE2() ||
2555       !isTypeLegal(I->getOperand(0)->getType(), SrcVT) ||
2556       !isTypeLegal(I->getType(), DstVT))
2557     return false;
2558 
2559   // Only allow vectors that use xmm/ymm/zmm.
2560   if (!SrcVT.isVector() || !DstVT.isVector() ||
2561       SrcVT.getVectorElementType() == MVT::i1 ||
2562       DstVT.getVectorElementType() == MVT::i1)
2563     return false;
2564 
2565   Register Reg = getRegForValue(I->getOperand(0));
2566   if (!Reg)
2567     return false;
2568 
2569   // Emit a reg-reg copy so we don't propagate cached known bits information
2570   // with the wrong VT if we fall out of fast isel after selecting this.
2571   const TargetRegisterClass *DstClass = TLI.getRegClassFor(DstVT);
2572   Register ResultReg = createResultReg(DstClass);
2573   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
2574           ResultReg)
2575       .addReg(Reg);
2576 
2577   updateValueMap(I, ResultReg);
2578   return true;
2579 }
2580 
IsMemcpySmall(uint64_t Len)2581 bool X86FastISel::IsMemcpySmall(uint64_t Len) {
2582   return Len <= (Subtarget->is64Bit() ? 32 : 16);
2583 }
2584 
TryEmitSmallMemcpy(X86AddressMode DestAM,X86AddressMode SrcAM,uint64_t Len)2585 bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
2586                                      X86AddressMode SrcAM, uint64_t Len) {
2587 
2588   // Make sure we don't bloat code by inlining very large memcpy's.
2589   if (!IsMemcpySmall(Len))
2590     return false;
2591 
2592   bool i64Legal = Subtarget->is64Bit();
2593 
2594   // We don't care about alignment here since we just emit integer accesses.
2595   while (Len) {
2596     MVT VT;
2597     if (Len >= 8 && i64Legal)
2598       VT = MVT::i64;
2599     else if (Len >= 4)
2600       VT = MVT::i32;
2601     else if (Len >= 2)
2602       VT = MVT::i16;
2603     else
2604       VT = MVT::i8;
2605 
2606     Register Reg;
2607     bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
2608     RV &= X86FastEmitStore(VT, Reg, DestAM);
2609     assert(RV && "Failed to emit load or store??");
2610     (void)RV;
2611 
2612     unsigned Size = VT.getSizeInBits()/8;
2613     Len -= Size;
2614     DestAM.Disp += Size;
2615     SrcAM.Disp += Size;
2616   }
2617 
2618   return true;
2619 }
2620 
fastLowerIntrinsicCall(const IntrinsicInst * II)2621 bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
2622   // FIXME: Handle more intrinsics.
2623   switch (II->getIntrinsicID()) {
2624   default: return false;
2625   case Intrinsic::convert_from_fp16:
2626   case Intrinsic::convert_to_fp16: {
2627     if (Subtarget->useSoftFloat() || !Subtarget->hasF16C())
2628       return false;
2629 
2630     const Value *Op = II->getArgOperand(0);
2631     Register InputReg = getRegForValue(Op);
2632     if (!InputReg)
2633       return false;
2634 
2635     // F16C only allows converting from float to half and from half to float.
2636     bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16;
2637     if (IsFloatToHalf) {
2638       if (!Op->getType()->isFloatTy())
2639         return false;
2640     } else {
2641       if (!II->getType()->isFloatTy())
2642         return false;
2643     }
2644 
2645     Register ResultReg;
2646     const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16);
2647     if (IsFloatToHalf) {
2648       // 'InputReg' is implicitly promoted from register class FR32 to
2649       // register class VR128 by method 'constrainOperandRegClass' which is
2650       // directly called by 'fastEmitInst_ri'.
2651       // Instruction VCVTPS2PHrr takes an extra immediate operand which is
2652       // used to provide rounding control: use MXCSR.RC, encoded as 0b100.
2653       // It's consistent with the other FP instructions, which are usually
2654       // controlled by MXCSR.
2655       unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPS2PHZ128rr
2656                                          : X86::VCVTPS2PHrr;
2657       InputReg = fastEmitInst_ri(Opc, RC, InputReg, 4);
2658 
2659       // Move the lower 32-bits of ResultReg to another register of class GR32.
2660       Opc = Subtarget->hasAVX512() ? X86::VMOVPDI2DIZrr
2661                                    : X86::VMOVPDI2DIrr;
2662       ResultReg = createResultReg(&X86::GR32RegClass);
2663       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg)
2664           .addReg(InputReg, RegState::Kill);
2665 
2666       // The result value is in the lower 16-bits of ResultReg.
2667       unsigned RegIdx = X86::sub_16bit;
2668       ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, RegIdx);
2669     } else {
2670       assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");
2671       // Explicitly zero-extend the input to 32-bit.
2672       InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::ZERO_EXTEND, InputReg);
2673 
2674       // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.
2675       InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
2676                             InputReg);
2677 
2678       unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPH2PSZ128rr
2679                                          : X86::VCVTPH2PSrr;
2680       InputReg = fastEmitInst_r(Opc, RC, InputReg);
2681 
2682       // The result value is in the lower 32-bits of ResultReg.
2683       // Emit an explicit copy from register class VR128 to register class FR32.
2684       ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));
2685       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2686               TII.get(TargetOpcode::COPY), ResultReg)
2687           .addReg(InputReg, RegState::Kill);
2688     }
2689 
2690     updateValueMap(II, ResultReg);
2691     return true;
2692   }
2693   case Intrinsic::frameaddress: {
2694     MachineFunction *MF = FuncInfo.MF;
2695     if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI())
2696       return false;
2697 
2698     Type *RetTy = II->getCalledFunction()->getReturnType();
2699 
2700     MVT VT;
2701     if (!isTypeLegal(RetTy, VT))
2702       return false;
2703 
2704     unsigned Opc;
2705     const TargetRegisterClass *RC = nullptr;
2706 
2707     switch (VT.SimpleTy) {
2708     default: llvm_unreachable("Invalid result type for frameaddress.");
2709     case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
2710     case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
2711     }
2712 
2713     // This needs to be set before we call getPtrSizedFrameRegister, otherwise
2714     // we get the wrong frame register.
2715     MachineFrameInfo &MFI = MF->getFrameInfo();
2716     MFI.setFrameAddressIsTaken(true);
2717 
2718     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2719     Register FrameReg = RegInfo->getPtrSizedFrameRegister(*MF);
2720     assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
2721             (FrameReg == X86::EBP && VT == MVT::i32)) &&
2722            "Invalid Frame Register!");
2723 
2724     // Always make a copy of the frame register to a vreg first, so that we
2725     // never directly reference the frame register (the TwoAddressInstruction-
2726     // Pass doesn't like that).
2727     Register SrcReg = createResultReg(RC);
2728     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2729             TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
2730 
2731     // Now recursively load from the frame address.
2732     // movq (%rbp), %rax
2733     // movq (%rax), %rax
2734     // movq (%rax), %rax
2735     // ...
2736     unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
2737     while (Depth--) {
2738       Register DestReg = createResultReg(RC);
2739       addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2740                            TII.get(Opc), DestReg), SrcReg);
2741       SrcReg = DestReg;
2742     }
2743 
2744     updateValueMap(II, SrcReg);
2745     return true;
2746   }
2747   case Intrinsic::memcpy: {
2748     const MemCpyInst *MCI = cast<MemCpyInst>(II);
2749     // Don't handle volatile or variable length memcpys.
2750     if (MCI->isVolatile())
2751       return false;
2752 
2753     if (isa<ConstantInt>(MCI->getLength())) {
2754       // Small memcpy's are common enough that we want to do them
2755       // without a call if possible.
2756       uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
2757       if (IsMemcpySmall(Len)) {
2758         X86AddressMode DestAM, SrcAM;
2759         if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||
2760             !X86SelectAddress(MCI->getRawSource(), SrcAM))
2761           return false;
2762         TryEmitSmallMemcpy(DestAM, SrcAM, Len);
2763         return true;
2764       }
2765     }
2766 
2767     unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
2768     if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
2769       return false;
2770 
2771     if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
2772       return false;
2773 
2774     return lowerCallTo(II, "memcpy", II->arg_size() - 1);
2775   }
2776   case Intrinsic::memset: {
2777     const MemSetInst *MSI = cast<MemSetInst>(II);
2778 
2779     if (MSI->isVolatile())
2780       return false;
2781 
2782     unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
2783     if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
2784       return false;
2785 
2786     if (MSI->getDestAddressSpace() > 255)
2787       return false;
2788 
2789     return lowerCallTo(II, "memset", II->arg_size() - 1);
2790   }
2791   case Intrinsic::stackprotector: {
2792     // Emit code to store the stack guard onto the stack.
2793     EVT PtrTy = TLI.getPointerTy(DL);
2794 
2795     const Value *Op1 = II->getArgOperand(0); // The guard's value.
2796     const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));
2797 
2798     MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);
2799 
2800     // Grab the frame index.
2801     X86AddressMode AM;
2802     if (!X86SelectAddress(Slot, AM)) return false;
2803     if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
2804     return true;
2805   }
2806   case Intrinsic::dbg_declare: {
2807     const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
2808     X86AddressMode AM;
2809     assert(DI->getAddress() && "Null address should be checked earlier!");
2810     if (!X86SelectAddress(DI->getAddress(), AM))
2811       return false;
2812     const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
2813     assert(DI->getVariable()->isValidLocationForIntrinsic(MIMD.getDL()) &&
2814            "Expected inlined-at fields to agree");
2815     addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II), AM)
2816         .addImm(0)
2817         .addMetadata(DI->getVariable())
2818         .addMetadata(DI->getExpression());
2819     return true;
2820   }
2821   case Intrinsic::trap: {
2822     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::TRAP));
2823     return true;
2824   }
2825   case Intrinsic::sqrt: {
2826     if (!Subtarget->hasSSE1())
2827       return false;
2828 
2829     Type *RetTy = II->getCalledFunction()->getReturnType();
2830 
2831     MVT VT;
2832     if (!isTypeLegal(RetTy, VT))
2833       return false;
2834 
2835     // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
2836     // is not generated by FastISel yet.
2837     // FIXME: Update this code once tablegen can handle it.
2838     static const uint16_t SqrtOpc[3][2] = {
2839       { X86::SQRTSSr,   X86::SQRTSDr },
2840       { X86::VSQRTSSr,  X86::VSQRTSDr },
2841       { X86::VSQRTSSZr, X86::VSQRTSDZr },
2842     };
2843     unsigned AVXLevel = Subtarget->hasAVX512() ? 2 :
2844                         Subtarget->hasAVX()    ? 1 :
2845                                                  0;
2846     unsigned Opc;
2847     switch (VT.SimpleTy) {
2848     default: return false;
2849     case MVT::f32: Opc = SqrtOpc[AVXLevel][0]; break;
2850     case MVT::f64: Opc = SqrtOpc[AVXLevel][1]; break;
2851     }
2852 
2853     const Value *SrcVal = II->getArgOperand(0);
2854     Register SrcReg = getRegForValue(SrcVal);
2855 
2856     if (!SrcReg)
2857       return false;
2858 
2859     const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
2860     Register ImplicitDefReg;
2861     if (AVXLevel > 0) {
2862       ImplicitDefReg = createResultReg(RC);
2863       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2864               TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
2865     }
2866 
2867     Register ResultReg = createResultReg(RC);
2868     MachineInstrBuilder MIB;
2869     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc),
2870                   ResultReg);
2871 
2872     if (ImplicitDefReg)
2873       MIB.addReg(ImplicitDefReg);
2874 
2875     MIB.addReg(SrcReg);
2876 
2877     updateValueMap(II, ResultReg);
2878     return true;
2879   }
2880   case Intrinsic::sadd_with_overflow:
2881   case Intrinsic::uadd_with_overflow:
2882   case Intrinsic::ssub_with_overflow:
2883   case Intrinsic::usub_with_overflow:
2884   case Intrinsic::smul_with_overflow:
2885   case Intrinsic::umul_with_overflow: {
2886     // This implements the basic lowering of the xalu with overflow intrinsics
2887     // into add/sub/mul followed by either seto or setb.
2888     const Function *Callee = II->getCalledFunction();
2889     auto *Ty = cast<StructType>(Callee->getReturnType());
2890     Type *RetTy = Ty->getTypeAtIndex(0U);
2891     assert(Ty->getTypeAtIndex(1)->isIntegerTy() &&
2892            Ty->getTypeAtIndex(1)->getScalarSizeInBits() == 1 &&
2893            "Overflow value expected to be an i1");
2894 
2895     MVT VT;
2896     if (!isTypeLegal(RetTy, VT))
2897       return false;
2898 
2899     if (VT < MVT::i8 || VT > MVT::i64)
2900       return false;
2901 
2902     const Value *LHS = II->getArgOperand(0);
2903     const Value *RHS = II->getArgOperand(1);
2904 
2905     // Canonicalize immediate to the RHS.
2906     if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative())
2907       std::swap(LHS, RHS);
2908 
2909     unsigned BaseOpc, CondCode;
2910     switch (II->getIntrinsicID()) {
2911     default: llvm_unreachable("Unexpected intrinsic!");
2912     case Intrinsic::sadd_with_overflow:
2913       BaseOpc = ISD::ADD; CondCode = X86::COND_O; break;
2914     case Intrinsic::uadd_with_overflow:
2915       BaseOpc = ISD::ADD; CondCode = X86::COND_B; break;
2916     case Intrinsic::ssub_with_overflow:
2917       BaseOpc = ISD::SUB; CondCode = X86::COND_O; break;
2918     case Intrinsic::usub_with_overflow:
2919       BaseOpc = ISD::SUB; CondCode = X86::COND_B; break;
2920     case Intrinsic::smul_with_overflow:
2921       BaseOpc = X86ISD::SMUL; CondCode = X86::COND_O; break;
2922     case Intrinsic::umul_with_overflow:
2923       BaseOpc = X86ISD::UMUL; CondCode = X86::COND_O; break;
2924     }
2925 
2926     Register LHSReg = getRegForValue(LHS);
2927     if (!LHSReg)
2928       return false;
2929 
2930     Register ResultReg;
2931     // Check if we have an immediate version.
2932     if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
2933       static const uint16_t Opc[2][4] = {
2934         { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
2935         { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
2936       };
2937 
2938       if (CI->isOne() && (BaseOpc == ISD::ADD || BaseOpc == ISD::SUB) &&
2939           CondCode == X86::COND_O) {
2940         // We can use INC/DEC.
2941         ResultReg = createResultReg(TLI.getRegClassFor(VT));
2942         bool IsDec = BaseOpc == ISD::SUB;
2943         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2944                 TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
2945           .addReg(LHSReg);
2946       } else
2947         ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, CI->getZExtValue());
2948     }
2949 
2950     Register RHSReg;
2951     if (!ResultReg) {
2952       RHSReg = getRegForValue(RHS);
2953       if (!RHSReg)
2954         return false;
2955       ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, RHSReg);
2956     }
2957 
2958     // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
2959     // it manually.
2960     if (BaseOpc == X86ISD::UMUL && !ResultReg) {
2961       static const uint16_t MULOpc[] =
2962         { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
2963       static const MCPhysReg Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
2964       // First copy the first operand into RAX, which is an implicit input to
2965       // the X86::MUL*r instruction.
2966       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2967               TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
2968         .addReg(LHSReg);
2969       ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
2970                                  TLI.getRegClassFor(VT), RHSReg);
2971     } else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
2972       static const uint16_t MULOpc[] =
2973         { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
2974       if (VT == MVT::i8) {
2975         // Copy the first operand into AL, which is an implicit input to the
2976         // X86::IMUL8r instruction.
2977         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2978                TII.get(TargetOpcode::COPY), X86::AL)
2979           .addReg(LHSReg);
2980         ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg);
2981       } else
2982         ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
2983                                     TLI.getRegClassFor(VT), LHSReg, RHSReg);
2984     }
2985 
2986     if (!ResultReg)
2987       return false;
2988 
2989     // Assign to a GPR since the overflow return value is lowered to a SETcc.
2990     Register ResultReg2 = createResultReg(&X86::GR8RegClass);
2991     assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
2992     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
2993             ResultReg2).addImm(CondCode);
2994 
2995     updateValueMap(II, ResultReg, 2);
2996     return true;
2997   }
2998   case Intrinsic::x86_sse_cvttss2si:
2999   case Intrinsic::x86_sse_cvttss2si64:
3000   case Intrinsic::x86_sse2_cvttsd2si:
3001   case Intrinsic::x86_sse2_cvttsd2si64: {
3002     bool IsInputDouble;
3003     switch (II->getIntrinsicID()) {
3004     default: llvm_unreachable("Unexpected intrinsic.");
3005     case Intrinsic::x86_sse_cvttss2si:
3006     case Intrinsic::x86_sse_cvttss2si64:
3007       if (!Subtarget->hasSSE1())
3008         return false;
3009       IsInputDouble = false;
3010       break;
3011     case Intrinsic::x86_sse2_cvttsd2si:
3012     case Intrinsic::x86_sse2_cvttsd2si64:
3013       if (!Subtarget->hasSSE2())
3014         return false;
3015       IsInputDouble = true;
3016       break;
3017     }
3018 
3019     Type *RetTy = II->getCalledFunction()->getReturnType();
3020     MVT VT;
3021     if (!isTypeLegal(RetTy, VT))
3022       return false;
3023 
3024     static const uint16_t CvtOpc[3][2][2] = {
3025       { { X86::CVTTSS2SIrr,   X86::CVTTSS2SI64rr },
3026         { X86::CVTTSD2SIrr,   X86::CVTTSD2SI64rr } },
3027       { { X86::VCVTTSS2SIrr,  X86::VCVTTSS2SI64rr },
3028         { X86::VCVTTSD2SIrr,  X86::VCVTTSD2SI64rr } },
3029       { { X86::VCVTTSS2SIZrr, X86::VCVTTSS2SI64Zrr },
3030         { X86::VCVTTSD2SIZrr, X86::VCVTTSD2SI64Zrr } },
3031     };
3032     unsigned AVXLevel = Subtarget->hasAVX512() ? 2 :
3033                         Subtarget->hasAVX()    ? 1 :
3034                                                  0;
3035     unsigned Opc;
3036     switch (VT.SimpleTy) {
3037     default: llvm_unreachable("Unexpected result type.");
3038     case MVT::i32: Opc = CvtOpc[AVXLevel][IsInputDouble][0]; break;
3039     case MVT::i64: Opc = CvtOpc[AVXLevel][IsInputDouble][1]; break;
3040     }
3041 
3042     // Check if we can fold insertelement instructions into the convert.
3043     const Value *Op = II->getArgOperand(0);
3044     while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
3045       const Value *Index = IE->getOperand(2);
3046       if (!isa<ConstantInt>(Index))
3047         break;
3048       unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
3049 
3050       if (!Idx) {
3051         Op = IE->getOperand(1);
3052         break;
3053       }
3054       Op = IE->getOperand(0);
3055     }
3056 
3057     Register Reg = getRegForValue(Op);
3058     if (!Reg)
3059       return false;
3060 
3061     Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
3062     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg)
3063       .addReg(Reg);
3064 
3065     updateValueMap(II, ResultReg);
3066     return true;
3067   }
3068   case Intrinsic::x86_sse42_crc32_32_8:
3069   case Intrinsic::x86_sse42_crc32_32_16:
3070   case Intrinsic::x86_sse42_crc32_32_32:
3071   case Intrinsic::x86_sse42_crc32_64_64: {
3072     if (!Subtarget->hasCRC32())
3073       return false;
3074 
3075     Type *RetTy = II->getCalledFunction()->getReturnType();
3076 
3077     MVT VT;
3078     if (!isTypeLegal(RetTy, VT))
3079       return false;
3080 
3081     unsigned Opc;
3082     const TargetRegisterClass *RC = nullptr;
3083 
3084     switch (II->getIntrinsicID()) {
3085     default:
3086       llvm_unreachable("Unexpected intrinsic.");
3087 #define GET_EGPR_IF_ENABLED(OPC) Subtarget->hasEGPR() ? OPC##_EVEX : OPC
3088     case Intrinsic::x86_sse42_crc32_32_8:
3089       Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r8);
3090       RC = &X86::GR32RegClass;
3091       break;
3092     case Intrinsic::x86_sse42_crc32_32_16:
3093       Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r16);
3094       RC = &X86::GR32RegClass;
3095       break;
3096     case Intrinsic::x86_sse42_crc32_32_32:
3097       Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r32);
3098       RC = &X86::GR32RegClass;
3099       break;
3100     case Intrinsic::x86_sse42_crc32_64_64:
3101       Opc = GET_EGPR_IF_ENABLED(X86::CRC32r64r64);
3102       RC = &X86::GR64RegClass;
3103       break;
3104 #undef GET_EGPR_IF_ENABLED
3105     }
3106 
3107     const Value *LHS = II->getArgOperand(0);
3108     const Value *RHS = II->getArgOperand(1);
3109 
3110     Register LHSReg = getRegForValue(LHS);
3111     Register RHSReg = getRegForValue(RHS);
3112     if (!LHSReg || !RHSReg)
3113       return false;
3114 
3115     Register ResultReg = fastEmitInst_rr(Opc, RC, LHSReg, RHSReg);
3116     if (!ResultReg)
3117       return false;
3118 
3119     updateValueMap(II, ResultReg);
3120     return true;
3121   }
3122   }
3123 }
3124 
fastLowerArguments()3125 bool X86FastISel::fastLowerArguments() {
3126   if (!FuncInfo.CanLowerReturn)
3127     return false;
3128 
3129   const Function *F = FuncInfo.Fn;
3130   if (F->isVarArg())
3131     return false;
3132 
3133   CallingConv::ID CC = F->getCallingConv();
3134   if (CC != CallingConv::C)
3135     return false;
3136 
3137   if (Subtarget->isCallingConvWin64(CC))
3138     return false;
3139 
3140   if (!Subtarget->is64Bit())
3141     return false;
3142 
3143   if (Subtarget->useSoftFloat())
3144     return false;
3145 
3146   // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments.
3147   unsigned GPRCnt = 0;
3148   unsigned FPRCnt = 0;
3149   for (auto const &Arg : F->args()) {
3150     if (Arg.hasAttribute(Attribute::ByVal) ||
3151         Arg.hasAttribute(Attribute::InReg) ||
3152         Arg.hasAttribute(Attribute::StructRet) ||
3153         Arg.hasAttribute(Attribute::SwiftSelf) ||
3154         Arg.hasAttribute(Attribute::SwiftAsync) ||
3155         Arg.hasAttribute(Attribute::SwiftError) ||
3156         Arg.hasAttribute(Attribute::Nest))
3157       return false;
3158 
3159     Type *ArgTy = Arg.getType();
3160     if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
3161       return false;
3162 
3163     EVT ArgVT = TLI.getValueType(DL, ArgTy);
3164     if (!ArgVT.isSimple()) return false;
3165     switch (ArgVT.getSimpleVT().SimpleTy) {
3166     default: return false;
3167     case MVT::i32:
3168     case MVT::i64:
3169       ++GPRCnt;
3170       break;
3171     case MVT::f32:
3172     case MVT::f64:
3173       if (!Subtarget->hasSSE1())
3174         return false;
3175       ++FPRCnt;
3176       break;
3177     }
3178 
3179     if (GPRCnt > 6)
3180       return false;
3181 
3182     if (FPRCnt > 8)
3183       return false;
3184   }
3185 
3186   static const MCPhysReg GPR32ArgRegs[] = {
3187     X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
3188   };
3189   static const MCPhysReg GPR64ArgRegs[] = {
3190     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
3191   };
3192   static const MCPhysReg XMMArgRegs[] = {
3193     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3194     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3195   };
3196 
3197   unsigned GPRIdx = 0;
3198   unsigned FPRIdx = 0;
3199   for (auto const &Arg : F->args()) {
3200     MVT VT = TLI.getSimpleValueType(DL, Arg.getType());
3201     const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
3202     MCRegister SrcReg;
3203     switch (VT.SimpleTy) {
3204     default: llvm_unreachable("Unexpected value type.");
3205     case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
3206     case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
3207     case MVT::f32: [[fallthrough]];
3208     case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
3209     }
3210     Register DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
3211     // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
3212     // Without this, EmitLiveInCopies may eliminate the livein if its only
3213     // use is a bitcast (which isn't turned into an instruction).
3214     Register ResultReg = createResultReg(RC);
3215     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3216             TII.get(TargetOpcode::COPY), ResultReg)
3217       .addReg(DstReg, getKillRegState(true));
3218     updateValueMap(&Arg, ResultReg);
3219   }
3220   return true;
3221 }
3222 
computeBytesPoppedByCalleeForSRet(const X86Subtarget * Subtarget,CallingConv::ID CC,const CallBase * CB)3223 static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
3224                                                   CallingConv::ID CC,
3225                                                   const CallBase *CB) {
3226   if (Subtarget->is64Bit())
3227     return 0;
3228   if (Subtarget->getTargetTriple().isOSMSVCRT())
3229     return 0;
3230   if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3231       CC == CallingConv::HiPE || CC == CallingConv::Tail ||
3232       CC == CallingConv::SwiftTail)
3233     return 0;
3234 
3235   if (CB)
3236     if (CB->arg_empty() || !CB->paramHasAttr(0, Attribute::StructRet) ||
3237         CB->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU())
3238       return 0;
3239 
3240   return 4;
3241 }
3242 
fastLowerCall(CallLoweringInfo & CLI)3243 bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
3244   auto &OutVals       = CLI.OutVals;
3245   auto &OutFlags      = CLI.OutFlags;
3246   auto &OutRegs       = CLI.OutRegs;
3247   auto &Ins           = CLI.Ins;
3248   auto &InRegs        = CLI.InRegs;
3249   CallingConv::ID CC  = CLI.CallConv;
3250   bool &IsTailCall    = CLI.IsTailCall;
3251   bool IsVarArg       = CLI.IsVarArg;
3252   const Value *Callee = CLI.Callee;
3253   MCSymbol *Symbol    = CLI.Symbol;
3254   const auto *CB      = CLI.CB;
3255 
3256   bool Is64Bit        = Subtarget->is64Bit();
3257   bool IsWin64        = Subtarget->isCallingConvWin64(CC);
3258 
3259   // Call / invoke instructions with NoCfCheck attribute require special
3260   // handling.
3261   if (CB && CB->doesNoCfCheck())
3262     return false;
3263 
3264   // Functions with no_caller_saved_registers that need special handling.
3265   if ((CB && isa<CallInst>(CB) && CB->hasFnAttr("no_caller_saved_registers")))
3266     return false;
3267 
3268   // Functions with no_callee_saved_registers that need special handling.
3269   if ((CB && CB->hasFnAttr("no_callee_saved_registers")))
3270     return false;
3271 
3272   // Indirect calls with CFI checks need special handling.
3273   if (CB && CB->isIndirectCall() && CB->getOperandBundle(LLVMContext::OB_kcfi))
3274     return false;
3275 
3276   // Functions using thunks for indirect calls need to use SDISel.
3277   if (Subtarget->useIndirectThunkCalls())
3278     return false;
3279 
3280   // Handle only C and fastcc calling conventions for now.
3281   switch (CC) {
3282   default: return false;
3283   case CallingConv::C:
3284   case CallingConv::Fast:
3285   case CallingConv::Tail:
3286   case CallingConv::Swift:
3287   case CallingConv::SwiftTail:
3288   case CallingConv::X86_FastCall:
3289   case CallingConv::X86_StdCall:
3290   case CallingConv::X86_ThisCall:
3291   case CallingConv::Win64:
3292   case CallingConv::X86_64_SysV:
3293   case CallingConv::CFGuard_Check:
3294     break;
3295   }
3296 
3297   // Allow SelectionDAG isel to handle tail calls.
3298   if (IsTailCall)
3299     return false;
3300 
3301   // fastcc with -tailcallopt is intended to provide a guaranteed
3302   // tail call optimization. Fastisel doesn't know how to do that.
3303   if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
3304       CC == CallingConv::Tail || CC == CallingConv::SwiftTail)
3305     return false;
3306 
3307   // Don't know how to handle Win64 varargs yet.  Nothing special needed for
3308   // x86-32. Special handling for x86-64 is implemented.
3309   if (IsVarArg && IsWin64)
3310     return false;
3311 
3312   // Don't know about inalloca yet.
3313   if (CLI.CB && CLI.CB->hasInAllocaArgument())
3314     return false;
3315 
3316   for (auto Flag : CLI.OutFlags)
3317     if (Flag.isSwiftError() || Flag.isPreallocated())
3318       return false;
3319 
3320   // Can't handle import call optimization.
3321   if (Is64Bit &&
3322       MF->getFunction().getParent()->getModuleFlag("import-call-optimization"))
3323     return false;
3324 
3325   SmallVector<MVT, 16> OutVTs;
3326   SmallVector<Register, 16> ArgRegs;
3327 
3328   // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
3329   // instruction. This is safe because it is common to all FastISel supported
3330   // calling conventions on x86.
3331   for (int i = 0, e = OutVals.size(); i != e; ++i) {
3332     Value *&Val = OutVals[i];
3333     ISD::ArgFlagsTy Flags = OutFlags[i];
3334     if (auto *CI = dyn_cast<ConstantInt>(Val)) {
3335       if (CI->getBitWidth() < 32) {
3336         if (Flags.isSExt())
3337           Val = ConstantInt::get(CI->getContext(), CI->getValue().sext(32));
3338         else
3339           Val = ConstantInt::get(CI->getContext(), CI->getValue().zext(32));
3340       }
3341     }
3342 
3343     // Passing bools around ends up doing a trunc to i1 and passing it.
3344     // Codegen this as an argument + "and 1".
3345     MVT VT;
3346     auto *TI = dyn_cast<TruncInst>(Val);
3347     Register ResultReg;
3348     if (TI && TI->getType()->isIntegerTy(1) && CLI.CB &&
3349         (TI->getParent() == CLI.CB->getParent()) && TI->hasOneUse()) {
3350       Value *PrevVal = TI->getOperand(0);
3351       ResultReg = getRegForValue(PrevVal);
3352 
3353       if (!ResultReg)
3354         return false;
3355 
3356       if (!isTypeLegal(PrevVal->getType(), VT))
3357         return false;
3358 
3359       ResultReg = fastEmit_ri(VT, VT, ISD::AND, ResultReg, 1);
3360     } else {
3361       if (!isTypeLegal(Val->getType(), VT) ||
3362           (VT.isVector() && VT.getVectorElementType() == MVT::i1))
3363         return false;
3364       ResultReg = getRegForValue(Val);
3365     }
3366 
3367     if (!ResultReg)
3368       return false;
3369 
3370     ArgRegs.push_back(ResultReg);
3371     OutVTs.push_back(VT);
3372   }
3373 
3374   // Analyze operands of the call, assigning locations to each operand.
3375   SmallVector<CCValAssign, 16> ArgLocs;
3376   CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());
3377 
3378   // Allocate shadow area for Win64
3379   if (IsWin64)
3380     CCInfo.AllocateStack(32, Align(8));
3381 
3382   CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
3383 
3384   // Get a count of how many bytes are to be pushed on the stack.
3385   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3386 
3387   // Issue CALLSEQ_START
3388   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
3389   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(AdjStackDown))
3390     .addImm(NumBytes).addImm(0).addImm(0);
3391 
3392   // Walk the register/memloc assignments, inserting copies/loads.
3393   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3394   for (const CCValAssign &VA : ArgLocs) {
3395     const Value *ArgVal = OutVals[VA.getValNo()];
3396     MVT ArgVT = OutVTs[VA.getValNo()];
3397 
3398     if (ArgVT == MVT::x86mmx)
3399       return false;
3400 
3401     Register ArgReg = ArgRegs[VA.getValNo()];
3402 
3403     // Promote the value if needed.
3404     switch (VA.getLocInfo()) {
3405     case CCValAssign::Full: break;
3406     case CCValAssign::SExt: {
3407       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
3408              "Unexpected extend");
3409 
3410       if (ArgVT == MVT::i1)
3411         return false;
3412 
3413       bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
3414                                        ArgVT, ArgReg);
3415       assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
3416       ArgVT = VA.getLocVT();
3417       break;
3418     }
3419     case CCValAssign::ZExt: {
3420       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
3421              "Unexpected extend");
3422 
3423       // Handle zero-extension from i1 to i8, which is common.
3424       if (ArgVT == MVT::i1) {
3425         // Set the high bits to zero.
3426         ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg);
3427         ArgVT = MVT::i8;
3428 
3429         if (!ArgReg)
3430           return false;
3431       }
3432 
3433       bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
3434                                        ArgVT, ArgReg);
3435       assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
3436       ArgVT = VA.getLocVT();
3437       break;
3438     }
3439     case CCValAssign::AExt: {
3440       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
3441              "Unexpected extend");
3442       bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
3443                                        ArgVT, ArgReg);
3444       if (!Emitted)
3445         Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
3446                                     ArgVT, ArgReg);
3447       if (!Emitted)
3448         Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
3449                                     ArgVT, ArgReg);
3450 
3451       assert(Emitted && "Failed to emit a aext!"); (void)Emitted;
3452       ArgVT = VA.getLocVT();
3453       break;
3454     }
3455     case CCValAssign::BCvt: {
3456       ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg);
3457       assert(ArgReg && "Failed to emit a bitcast!");
3458       ArgVT = VA.getLocVT();
3459       break;
3460     }
3461     case CCValAssign::VExt:
3462       // VExt has not been implemented, so this should be impossible to reach
3463       // for now.  However, fallback to Selection DAG isel once implemented.
3464       return false;
3465     case CCValAssign::AExtUpper:
3466     case CCValAssign::SExtUpper:
3467     case CCValAssign::ZExtUpper:
3468     case CCValAssign::FPExt:
3469     case CCValAssign::Trunc:
3470       llvm_unreachable("Unexpected loc info!");
3471     case CCValAssign::Indirect:
3472       // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
3473       // support this.
3474       return false;
3475     }
3476 
3477     if (VA.isRegLoc()) {
3478       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3479               TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
3480       OutRegs.push_back(VA.getLocReg());
3481     } else {
3482       assert(VA.isMemLoc() && "Unknown value location!");
3483 
3484       // Don't emit stores for undef values.
3485       if (isa<UndefValue>(ArgVal))
3486         continue;
3487 
3488       unsigned LocMemOffset = VA.getLocMemOffset();
3489       X86AddressMode AM;
3490       AM.Base.Reg = RegInfo->getStackRegister();
3491       AM.Disp = LocMemOffset;
3492       ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
3493       Align Alignment = DL.getABITypeAlign(ArgVal->getType());
3494       MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
3495           MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset),
3496           MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
3497       if (Flags.isByVal()) {
3498         X86AddressMode SrcAM;
3499         SrcAM.Base.Reg = ArgReg;
3500         if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
3501           return false;
3502       } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
3503         // If this is a really simple value, emit this with the Value* version
3504         // of X86FastEmitStore.  If it isn't simple, we don't want to do this,
3505         // as it can cause us to reevaluate the argument.
3506         if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
3507           return false;
3508       } else {
3509         if (!X86FastEmitStore(ArgVT, ArgReg, AM, MMO))
3510           return false;
3511       }
3512     }
3513   }
3514 
3515   // ELF / PIC requires GOT in the EBX register before function calls via PLT
3516   // GOT pointer.
3517   if (Subtarget->isPICStyleGOT()) {
3518     Register Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
3519     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3520             TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
3521   }
3522 
3523   if (Is64Bit && IsVarArg && !IsWin64) {
3524     // From AMD64 ABI document:
3525     // For calls that may call functions that use varargs or stdargs
3526     // (prototype-less calls or calls to functions containing ellipsis (...) in
3527     // the declaration) %al is used as hidden argument to specify the number
3528     // of SSE registers used. The contents of %al do not need to match exactly
3529     // the number of registers, but must be an ubound on the number of SSE
3530     // registers used and is in the range 0 - 8 inclusive.
3531 
3532     // Count the number of XMM registers allocated.
3533     static const MCPhysReg XMMArgRegs[] = {
3534       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3535       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3536     };
3537     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3538     assert((Subtarget->hasSSE1() || !NumXMMRegs)
3539            && "SSE registers cannot be used when SSE is disabled");
3540     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV8ri),
3541             X86::AL).addImm(NumXMMRegs);
3542   }
3543 
3544   // Materialize callee address in a register. FIXME: GV address can be
3545   // handled with a CALLpcrel32 instead.
3546   X86AddressMode CalleeAM;
3547   if (!X86SelectCallAddress(Callee, CalleeAM))
3548     return false;
3549 
3550   Register CalleeOp;
3551   const GlobalValue *GV = nullptr;
3552   if (CalleeAM.GV != nullptr) {
3553     GV = CalleeAM.GV;
3554   } else if (CalleeAM.Base.Reg) {
3555     CalleeOp = CalleeAM.Base.Reg;
3556   } else
3557     return false;
3558 
3559   // Issue the call.
3560   MachineInstrBuilder MIB;
3561   if (CalleeOp) {
3562     // Register-indirect call.
3563     unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
3564     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(CallOpc))
3565       .addReg(CalleeOp);
3566   } else {
3567     // Direct call.
3568     assert(GV && "Not a direct call");
3569     // See if we need any target-specific flags on the GV operand.
3570     unsigned char OpFlags = Subtarget->classifyGlobalFunctionReference(GV);
3571     if (OpFlags == X86II::MO_PLT && !Is64Bit &&
3572         TM.getRelocationModel() == Reloc::Static && isa<Function>(GV) &&
3573         cast<Function>(GV)->isIntrinsic())
3574       OpFlags = X86II::MO_NO_FLAG;
3575 
3576     // This will be a direct call, or an indirect call through memory for
3577     // NonLazyBind calls or dllimport calls.
3578     bool NeedLoad = OpFlags == X86II::MO_DLLIMPORT ||
3579                     OpFlags == X86II::MO_GOTPCREL ||
3580                     OpFlags == X86II::MO_GOTPCREL_NORELAX ||
3581                     OpFlags == X86II::MO_COFFSTUB;
3582     unsigned CallOpc = NeedLoad
3583                            ? (Is64Bit ? X86::CALL64m : X86::CALL32m)
3584                            : (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32);
3585 
3586     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(CallOpc));
3587     if (NeedLoad)
3588       MIB.addReg(Is64Bit ? X86::RIP : X86::NoRegister).addImm(1).addReg(0);
3589     if (Symbol)
3590       MIB.addSym(Symbol, OpFlags);
3591     else
3592       MIB.addGlobalAddress(GV, 0, OpFlags);
3593     if (NeedLoad)
3594       MIB.addReg(0);
3595   }
3596 
3597   // Add a register mask operand representing the call-preserved registers.
3598   // Proper defs for return values will be added by setPhysRegsDeadExcept().
3599   MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
3600 
3601   // Add an implicit use GOT pointer in EBX.
3602   if (Subtarget->isPICStyleGOT())
3603     MIB.addReg(X86::EBX, RegState::Implicit);
3604 
3605   if (Is64Bit && IsVarArg && !IsWin64)
3606     MIB.addReg(X86::AL, RegState::Implicit);
3607 
3608   // Add implicit physical register uses to the call.
3609   for (auto Reg : OutRegs)
3610     MIB.addReg(Reg, RegState::Implicit);
3611 
3612   // Issue CALLSEQ_END
3613   unsigned NumBytesForCalleeToPop =
3614       X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
3615                        TM.Options.GuaranteedTailCallOpt)
3616           ? NumBytes // Callee pops everything.
3617           : computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CB);
3618   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
3619   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(AdjStackUp))
3620     .addImm(NumBytes).addImm(NumBytesForCalleeToPop);
3621 
3622   // Now handle call return values.
3623   SmallVector<CCValAssign, 16> RVLocs;
3624   CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
3625                     CLI.RetTy->getContext());
3626   CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
3627 
3628   // Copy all of the result registers out of their specified physreg.
3629   Register ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
3630   for (unsigned i = 0; i != RVLocs.size(); ++i) {
3631     CCValAssign &VA = RVLocs[i];
3632     EVT CopyVT = VA.getValVT();
3633     Register CopyReg = ResultReg + i;
3634     Register SrcReg = VA.getLocReg();
3635 
3636     // If this is x86-64, and we disabled SSE, we can't return FP values
3637     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
3638         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
3639       report_fatal_error("SSE register return with SSE disabled");
3640     }
3641 
3642     // If we prefer to use the value in xmm registers, copy it out as f80 and
3643     // use a truncate to move it from fp stack reg to xmm reg.
3644     if ((SrcReg == X86::FP0 || SrcReg == X86::FP1) &&
3645         isScalarFPTypeInSSEReg(VA.getValVT())) {
3646       CopyVT = MVT::f80;
3647       CopyReg = createResultReg(&X86::RFP80RegClass);
3648     }
3649 
3650     // Copy out the result.
3651     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3652             TII.get(TargetOpcode::COPY), CopyReg).addReg(SrcReg);
3653     InRegs.push_back(VA.getLocReg());
3654 
3655     // Round the f80 to the right size, which also moves it to the appropriate
3656     // xmm register. This is accomplished by storing the f80 value in memory
3657     // and then loading it back.
3658     if (CopyVT != VA.getValVT()) {
3659       EVT ResVT = VA.getValVT();
3660       unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
3661       unsigned MemSize = ResVT.getSizeInBits()/8;
3662       int FI = MFI.CreateStackObject(MemSize, Align(MemSize), false);
3663       addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3664                                 TII.get(Opc)), FI)
3665         .addReg(CopyReg);
3666       Opc = ResVT == MVT::f32 ? X86::MOVSSrm_alt : X86::MOVSDrm_alt;
3667       addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3668                                 TII.get(Opc), ResultReg + i), FI);
3669     }
3670   }
3671 
3672   CLI.ResultReg = ResultReg;
3673   CLI.NumResultRegs = RVLocs.size();
3674   CLI.Call = MIB;
3675 
3676   return true;
3677 }
3678 
3679 bool
fastSelectInstruction(const Instruction * I)3680 X86FastISel::fastSelectInstruction(const Instruction *I)  {
3681   switch (I->getOpcode()) {
3682   default: break;
3683   case Instruction::Load:
3684     return X86SelectLoad(I);
3685   case Instruction::Store:
3686     return X86SelectStore(I);
3687   case Instruction::Ret:
3688     return X86SelectRet(I);
3689   case Instruction::ICmp:
3690   case Instruction::FCmp:
3691     return X86SelectCmp(I);
3692   case Instruction::ZExt:
3693     return X86SelectZExt(I);
3694   case Instruction::SExt:
3695     return X86SelectSExt(I);
3696   case Instruction::Br:
3697     return X86SelectBranch(I);
3698   case Instruction::LShr:
3699   case Instruction::AShr:
3700   case Instruction::Shl:
3701     return X86SelectShift(I);
3702   case Instruction::SDiv:
3703   case Instruction::UDiv:
3704   case Instruction::SRem:
3705   case Instruction::URem:
3706     return X86SelectDivRem(I);
3707   case Instruction::Select:
3708     return X86SelectSelect(I);
3709   case Instruction::Trunc:
3710     return X86SelectTrunc(I);
3711   case Instruction::FPExt:
3712     return X86SelectFPExt(I);
3713   case Instruction::FPTrunc:
3714     return X86SelectFPTrunc(I);
3715   case Instruction::SIToFP:
3716     return X86SelectSIToFP(I);
3717   case Instruction::UIToFP:
3718     return X86SelectUIToFP(I);
3719   case Instruction::IntToPtr: // Deliberate fall-through.
3720   case Instruction::PtrToInt: {
3721     EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
3722     EVT DstVT = TLI.getValueType(DL, I->getType());
3723     if (DstVT.bitsGT(SrcVT))
3724       return X86SelectZExt(I);
3725     if (DstVT.bitsLT(SrcVT))
3726       return X86SelectTrunc(I);
3727     Register Reg = getRegForValue(I->getOperand(0));
3728     if (!Reg)
3729       return false;
3730     updateValueMap(I, Reg);
3731     return true;
3732   }
3733   case Instruction::BitCast:
3734     return X86SelectBitCast(I);
3735   }
3736 
3737   return false;
3738 }
3739 
X86MaterializeInt(const ConstantInt * CI,MVT VT)3740 Register X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
3741   if (VT > MVT::i64)
3742     return Register();
3743 
3744   uint64_t Imm = CI->getZExtValue();
3745   if (Imm == 0) {
3746     Register SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
3747     switch (VT.SimpleTy) {
3748     default: llvm_unreachable("Unexpected value type");
3749     case MVT::i1:
3750     case MVT::i8:
3751       return fastEmitInst_extractsubreg(MVT::i8, SrcReg, X86::sub_8bit);
3752     case MVT::i16:
3753       return fastEmitInst_extractsubreg(MVT::i16, SrcReg, X86::sub_16bit);
3754     case MVT::i32:
3755       return SrcReg;
3756     case MVT::i64: {
3757       Register ResultReg = createResultReg(&X86::GR64RegClass);
3758       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3759               TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
3760         .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
3761       return ResultReg;
3762     }
3763     }
3764   }
3765 
3766   unsigned Opc = 0;
3767   switch (VT.SimpleTy) {
3768   default: llvm_unreachable("Unexpected value type");
3769   case MVT::i1:
3770     VT = MVT::i8;
3771     [[fallthrough]];
3772   case MVT::i8:  Opc = X86::MOV8ri;  break;
3773   case MVT::i16: Opc = X86::MOV16ri; break;
3774   case MVT::i32: Opc = X86::MOV32ri; break;
3775   case MVT::i64: {
3776     if (isUInt<32>(Imm))
3777       Opc = X86::MOV32ri64;
3778     else if (isInt<32>(Imm))
3779       Opc = X86::MOV64ri32;
3780     else
3781       Opc = X86::MOV64ri;
3782     break;
3783   }
3784   }
3785   return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
3786 }
3787 
X86MaterializeFP(const ConstantFP * CFP,MVT VT)3788 Register X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
3789   if (CFP->isNullValue())
3790     return fastMaterializeFloatZero(CFP);
3791 
3792   // Can't handle alternate code models yet.
3793   CodeModel::Model CM = TM.getCodeModel();
3794   if (CM != CodeModel::Small && CM != CodeModel::Medium &&
3795       CM != CodeModel::Large)
3796     return Register();
3797 
3798   // Get opcode and regclass of the output for the given load instruction.
3799   unsigned Opc = 0;
3800   bool HasSSE1 = Subtarget->hasSSE1();
3801   bool HasSSE2 = Subtarget->hasSSE2();
3802   bool HasAVX = Subtarget->hasAVX();
3803   bool HasAVX512 = Subtarget->hasAVX512();
3804   switch (VT.SimpleTy) {
3805   default:
3806     return Register();
3807   case MVT::f32:
3808     Opc = HasAVX512 ? X86::VMOVSSZrm_alt
3809           : HasAVX  ? X86::VMOVSSrm_alt
3810           : HasSSE1 ? X86::MOVSSrm_alt
3811                     : X86::LD_Fp32m;
3812     break;
3813   case MVT::f64:
3814     Opc = HasAVX512 ? X86::VMOVSDZrm_alt
3815           : HasAVX  ? X86::VMOVSDrm_alt
3816           : HasSSE2 ? X86::MOVSDrm_alt
3817                     : X86::LD_Fp64m;
3818     break;
3819   case MVT::f80:
3820     // No f80 support yet.
3821     return Register();
3822   }
3823 
3824   // MachineConstantPool wants an explicit alignment.
3825   Align Alignment = DL.getPrefTypeAlign(CFP->getType());
3826 
3827   // x86-32 PIC requires a PIC base register for constant pools.
3828   Register PICBase;
3829   unsigned char OpFlag = Subtarget->classifyLocalReference(nullptr);
3830   if (OpFlag == X86II::MO_PIC_BASE_OFFSET)
3831     PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
3832   else if (OpFlag == X86II::MO_GOTOFF)
3833     PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
3834   else if (Subtarget->is64Bit() && TM.getCodeModel() != CodeModel::Large)
3835     PICBase = X86::RIP;
3836 
3837   // Create the load from the constant pool.
3838   unsigned CPI = MCP.getConstantPoolIndex(CFP, Alignment);
3839   Register ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
3840 
3841   // Large code model only applies to 64-bit mode.
3842   if (Subtarget->is64Bit() && CM == CodeModel::Large) {
3843     Register AddrReg = createResultReg(&X86::GR64RegClass);
3844     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV64ri),
3845             AddrReg)
3846       .addConstantPoolIndex(CPI, 0, OpFlag);
3847     MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3848                                       TII.get(Opc), ResultReg);
3849     addRegReg(MIB, AddrReg, false, X86::NoSubRegister, PICBase, false,
3850               X86::NoSubRegister);
3851     MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
3852         MachinePointerInfo::getConstantPool(*FuncInfo.MF),
3853         MachineMemOperand::MOLoad, DL.getPointerSize(), Alignment);
3854     MIB->addMemOperand(*FuncInfo.MF, MMO);
3855     return ResultReg;
3856   }
3857 
3858   addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3859                                    TII.get(Opc), ResultReg),
3860                            CPI, PICBase, OpFlag);
3861   return ResultReg;
3862 }
3863 
X86MaterializeGV(const GlobalValue * GV,MVT VT)3864 Register X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
3865   // Can't handle large GlobalValues yet.
3866   if (TM.getCodeModel() != CodeModel::Small &&
3867       TM.getCodeModel() != CodeModel::Medium)
3868     return Register();
3869   if (TM.isLargeGlobalValue(GV))
3870     return Register();
3871 
3872   // Materialize addresses with LEA/MOV instructions.
3873   X86AddressMode AM;
3874   if (X86SelectAddress(GV, AM)) {
3875     // If the expression is just a basereg, then we're done, otherwise we need
3876     // to emit an LEA.
3877     if (AM.BaseType == X86AddressMode::RegBase &&
3878         AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
3879       return AM.Base.Reg;
3880 
3881     Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
3882     if (TM.getRelocationModel() == Reloc::Static &&
3883         TLI.getPointerTy(DL) == MVT::i64) {
3884       // The displacement code could be more than 32 bits away so we need to use
3885       // an instruction with a 64 bit immediate
3886       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV64ri),
3887               ResultReg)
3888         .addGlobalAddress(GV);
3889     } else {
3890       unsigned Opc =
3891           TLI.getPointerTy(DL) == MVT::i32
3892               ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
3893               : X86::LEA64r;
3894       addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3895                              TII.get(Opc), ResultReg), AM);
3896     }
3897     return ResultReg;
3898   }
3899   return Register();
3900 }
3901 
fastMaterializeConstant(const Constant * C)3902 Register X86FastISel::fastMaterializeConstant(const Constant *C) {
3903   EVT CEVT = TLI.getValueType(DL, C->getType(), true);
3904 
3905   // Only handle simple types.
3906   if (!CEVT.isSimple())
3907     return Register();
3908   MVT VT = CEVT.getSimpleVT();
3909 
3910   if (const auto *CI = dyn_cast<ConstantInt>(C))
3911     return X86MaterializeInt(CI, VT);
3912   if (const auto *CFP = dyn_cast<ConstantFP>(C))
3913     return X86MaterializeFP(CFP, VT);
3914   if (const auto *GV = dyn_cast<GlobalValue>(C))
3915     return X86MaterializeGV(GV, VT);
3916   if (isa<UndefValue>(C)) {
3917     unsigned Opc = 0;
3918     switch (VT.SimpleTy) {
3919     default:
3920       break;
3921     case MVT::f32:
3922       if (!Subtarget->hasSSE1())
3923         Opc = X86::LD_Fp032;
3924       break;
3925     case MVT::f64:
3926       if (!Subtarget->hasSSE2())
3927         Opc = X86::LD_Fp064;
3928       break;
3929     case MVT::f80:
3930       Opc = X86::LD_Fp080;
3931       break;
3932     }
3933 
3934     if (Opc) {
3935       Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
3936       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc),
3937               ResultReg);
3938       return ResultReg;
3939     }
3940   }
3941 
3942   return Register();
3943 }
3944 
fastMaterializeAlloca(const AllocaInst * C)3945 Register X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
3946   // Fail on dynamic allocas. At this point, getRegForValue has already
3947   // checked its CSE maps, so if we're here trying to handle a dynamic
3948   // alloca, we're not going to succeed. X86SelectAddress has a
3949   // check for dynamic allocas, because it's called directly from
3950   // various places, but targetMaterializeAlloca also needs a check
3951   // in order to avoid recursion between getRegForValue,
3952   // X86SelectAddrss, and targetMaterializeAlloca.
3953   if (!FuncInfo.StaticAllocaMap.count(C))
3954     return Register();
3955   assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");
3956 
3957   X86AddressMode AM;
3958   if (!X86SelectAddress(C, AM))
3959     return Register();
3960   unsigned Opc =
3961       TLI.getPointerTy(DL) == MVT::i32
3962           ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
3963           : X86::LEA64r;
3964   const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
3965   Register ResultReg = createResultReg(RC);
3966   addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3967                          TII.get(Opc), ResultReg), AM);
3968   return ResultReg;
3969 }
3970 
fastMaterializeFloatZero(const ConstantFP * CF)3971 Register X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
3972   MVT VT;
3973   if (!isTypeLegal(CF->getType(), VT))
3974     return Register();
3975 
3976   // Get opcode and regclass for the given zero.
3977   bool HasSSE1 = Subtarget->hasSSE1();
3978   bool HasSSE2 = Subtarget->hasSSE2();
3979   bool HasAVX512 = Subtarget->hasAVX512();
3980   unsigned Opc = 0;
3981   switch (VT.SimpleTy) {
3982   default: return 0;
3983   case MVT::f16:
3984     Opc = HasAVX512 ? X86::AVX512_FsFLD0SH : X86::FsFLD0SH;
3985     break;
3986   case MVT::f32:
3987     Opc = HasAVX512 ? X86::AVX512_FsFLD0SS
3988           : HasSSE1 ? X86::FsFLD0SS
3989                     : X86::LD_Fp032;
3990     break;
3991   case MVT::f64:
3992     Opc = HasAVX512 ? X86::AVX512_FsFLD0SD
3993           : HasSSE2 ? X86::FsFLD0SD
3994                     : X86::LD_Fp064;
3995     break;
3996   case MVT::f80:
3997     // No f80 support yet.
3998     return Register();
3999   }
4000 
4001   Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
4002   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg);
4003   return ResultReg;
4004 }
4005 
tryToFoldLoadIntoMI(MachineInstr * MI,unsigned OpNo,const LoadInst * LI)4006 bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
4007                                       const LoadInst *LI) {
4008   const Value *Ptr = LI->getPointerOperand();
4009   X86AddressMode AM;
4010   if (!X86SelectAddress(Ptr, AM))
4011     return false;
4012 
4013   const X86InstrInfo &XII = (const X86InstrInfo &)TII;
4014 
4015   unsigned Size = DL.getTypeAllocSize(LI->getType());
4016 
4017   SmallVector<MachineOperand, 8> AddrOps;
4018   AM.getFullAddress(AddrOps);
4019 
4020   MachineInstr *Result = XII.foldMemoryOperandImpl(
4021       *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, LI->getAlign(),
4022       /*AllowCommute=*/true);
4023   if (!Result)
4024     return false;
4025 
4026   // The index register could be in the wrong register class.  Unfortunately,
4027   // foldMemoryOperandImpl could have commuted the instruction so its not enough
4028   // to just look at OpNo + the offset to the index reg.  We actually need to
4029   // scan the instruction to find the index reg and see if its the correct reg
4030   // class.
4031   unsigned OperandNo = 0;
4032   for (MachineInstr::mop_iterator I = Result->operands_begin(),
4033        E = Result->operands_end(); I != E; ++I, ++OperandNo) {
4034     MachineOperand &MO = *I;
4035     if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg)
4036       continue;
4037     // Found the index reg, now try to rewrite it.
4038     Register IndexReg = constrainOperandRegClass(Result->getDesc(),
4039                                                  MO.getReg(), OperandNo);
4040     if (IndexReg == MO.getReg())
4041       continue;
4042     MO.setReg(IndexReg);
4043   }
4044 
4045   Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
4046   Result->cloneInstrSymbols(*FuncInfo.MF, *MI);
4047   MachineBasicBlock::iterator I(MI);
4048   removeDeadCode(I, std::next(I));
4049   return true;
4050 }
4051 
fastEmitInst_rrrr(unsigned MachineInstOpcode,const TargetRegisterClass * RC,Register Op0,Register Op1,Register Op2,Register Op3)4052 Register X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode,
4053                                         const TargetRegisterClass *RC,
4054                                         Register Op0, Register Op1,
4055                                         Register Op2, Register Op3) {
4056   const MCInstrDesc &II = TII.get(MachineInstOpcode);
4057 
4058   Register ResultReg = createResultReg(RC);
4059   Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
4060   Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
4061   Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
4062   Op3 = constrainOperandRegClass(II, Op3, II.getNumDefs() + 3);
4063 
4064   if (II.getNumDefs() >= 1)
4065     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, ResultReg)
4066         .addReg(Op0)
4067         .addReg(Op1)
4068         .addReg(Op2)
4069         .addReg(Op3);
4070   else {
4071     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
4072         .addReg(Op0)
4073         .addReg(Op1)
4074         .addReg(Op2)
4075         .addReg(Op3);
4076     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
4077             ResultReg)
4078         .addReg(II.implicit_defs()[0]);
4079   }
4080   return ResultReg;
4081 }
4082 
4083 namespace llvm {
createFastISel(FunctionLoweringInfo & funcInfo,const TargetLibraryInfo * libInfo)4084   FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
4085                                 const TargetLibraryInfo *libInfo) {
4086     return new X86FastISel(funcInfo, libInfo);
4087   }
4088 }
4089