xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLoweringCall.cpp (revision e64bea71c21eb42e97aa615188ba91f6cce0d36d)
//===- llvm/lib/Target/X86/X86ISelLoweringCall.cpp - Call lowering --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file implements the lowering of LLVM calls to DAG nodes.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "MCTargetDesc/X86MCAsmInfo.h"
15 #include "X86.h"
16 #include "X86CallingConv.h"
17 #include "X86FrameLowering.h"
18 #include "X86ISelLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86MachineFunctionInfo.h"
21 #include "X86TargetMachine.h"
22 #include "llvm/ADT/Statistic.h"
23 #include "llvm/Analysis/ObjCARCUtil.h"
24 #include "llvm/CodeGen/MachineJumpTableInfo.h"
25 #include "llvm/CodeGen/MachineModuleInfo.h"
26 #include "llvm/CodeGen/WinEHFuncInfo.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IRBuilder.h"
29 #include "llvm/IR/Module.h"
30 
31 #define DEBUG_TYPE "x86-isel"
32 
33 using namespace llvm;
34 
35 STATISTIC(NumTailCalls, "Number of tail calls");
36 
37 /// Call this when the user attempts to do something unsupported, like
38 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
39 /// report_fatal_error, so calling code should attempt to recover without
40 /// crashing.
errorUnsupported(SelectionDAG & DAG,const SDLoc & dl,const char * Msg)41 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
42                              const char *Msg) {
43   MachineFunction &MF = DAG.getMachineFunction();
44   DAG.getContext()->diagnose(
45       DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
46 }
47 
48 /// Returns true if a CC can dynamically exclude a register from the list of
49 /// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
50 /// the return registers.
shouldDisableRetRegFromCSR(CallingConv::ID CC)51 static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
52   switch (CC) {
53   default:
54     return false;
55   case CallingConv::X86_RegCall:
56   case CallingConv::PreserveMost:
57   case CallingConv::PreserveAll:
58     return true;
59   }
60 }
61 
62 /// Returns true if a CC can dynamically exclude a register from the list of
63 /// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
64 /// the parameters.
shouldDisableArgRegFromCSR(CallingConv::ID CC)65 static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
66   return CC == CallingConv::X86_RegCall;
67 }
68 
69 static std::pair<MVT, unsigned>
handleMaskRegisterForCallingConv(unsigned NumElts,CallingConv::ID CC,const X86Subtarget & Subtarget)70 handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
71                                  const X86Subtarget &Subtarget) {
72   // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
73   // convention is one that uses k registers.
74   if (NumElts == 2)
75     return {MVT::v2i64, 1};
76   if (NumElts == 4)
77     return {MVT::v4i32, 1};
78   if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
79       CC != CallingConv::Intel_OCL_BI)
80     return {MVT::v8i16, 1};
81   if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
82       CC != CallingConv::Intel_OCL_BI)
83     return {MVT::v16i8, 1};
84   // v32i1 passes in ymm unless we have BWI and the calling convention is
85   // regcall.
86   if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
87     return {MVT::v32i8, 1};
88   // Split v64i1 vectors if we don't have v64i8 available.
89   if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
90     if (Subtarget.useAVX512Regs())
91       return {MVT::v64i8, 1};
92     return {MVT::v32i8, 2};
93   }
94 
95   // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
96   if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
97       NumElts > 64)
98     return {MVT::i8, NumElts};
99 
100   return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
101 }
102 
getRegisterTypeForCallingConv(LLVMContext & Context,CallingConv::ID CC,EVT VT) const103 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
104                                                      CallingConv::ID CC,
105                                                      EVT VT) const {
106   if (VT.isVector()) {
107     if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
108       unsigned NumElts = VT.getVectorNumElements();
109 
110       MVT RegisterVT;
111       unsigned NumRegisters;
112       std::tie(RegisterVT, NumRegisters) =
113           handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
114       if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
115         return RegisterVT;
116     }
117 
118     if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
119       return MVT::v8f16;
120   }
121 
122   // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
123   if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
124       !Subtarget.hasX87())
125     return MVT::i32;
126 
127   if (isTypeLegal(MVT::f16)) {
128     if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
129       return getRegisterTypeForCallingConv(
130           Context, CC, VT.changeVectorElementType(MVT::f16));
131 
132     if (VT == MVT::bf16)
133       return MVT::f16;
134   }
135 
136   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
137 }
138 
getNumRegistersForCallingConv(LLVMContext & Context,CallingConv::ID CC,EVT VT) const139 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
140                                                           CallingConv::ID CC,
141                                                           EVT VT) const {
142   if (VT.isVector()) {
143     if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
144       unsigned NumElts = VT.getVectorNumElements();
145 
146       MVT RegisterVT;
147       unsigned NumRegisters;
148       std::tie(RegisterVT, NumRegisters) =
149           handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
150       if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
151         return NumRegisters;
152     }
153 
154     if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
155       return 1;
156   }
157 
158   // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
159   // x87 is disabled.
160   if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
161     if (VT == MVT::f64)
162       return 2;
163     if (VT == MVT::f80)
164       return 3;
165   }
166 
167   if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
168       isTypeLegal(MVT::f16))
169     return getNumRegistersForCallingConv(Context, CC,
170                                          VT.changeVectorElementType(MVT::f16));
171 
172   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
173 }
174 
getVectorTypeBreakdownForCallingConv(LLVMContext & Context,CallingConv::ID CC,EVT VT,EVT & IntermediateVT,unsigned & NumIntermediates,MVT & RegisterVT) const175 unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
176     LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
177     unsigned &NumIntermediates, MVT &RegisterVT) const {
178   // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
179   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
180       Subtarget.hasAVX512() &&
181       (!isPowerOf2_32(VT.getVectorNumElements()) ||
182        (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
183        VT.getVectorNumElements() > 64)) {
184     RegisterVT = MVT::i8;
185     IntermediateVT = MVT::i1;
186     NumIntermediates = VT.getVectorNumElements();
187     return NumIntermediates;
188   }
189 
190   // Split v64i1 vectors if we don't have v64i8 available.
191   if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
192       CC != CallingConv::X86_RegCall) {
193     RegisterVT = MVT::v32i8;
194     IntermediateVT = MVT::v32i1;
195     NumIntermediates = 2;
196     return 2;
197   }
198 
199   // Split vNbf16 vectors according to vNf16.
200   if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
201       isTypeLegal(MVT::f16))
202     VT = VT.changeVectorElementType(MVT::f16);
203 
204   return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
205                                               NumIntermediates, RegisterVT);
206 }
207 
getSetCCResultType(const DataLayout & DL,LLVMContext & Context,EVT VT) const208 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
209                                           LLVMContext& Context,
210                                           EVT VT) const {
211   if (!VT.isVector())
212     return MVT::i8;
213 
214   if (Subtarget.hasAVX512()) {
215     // Figure out what this type will be legalized to.
216     EVT LegalVT = VT;
217     while (getTypeAction(Context, LegalVT) != TypeLegal)
218       LegalVT = getTypeToTransformTo(Context, LegalVT);
219 
220     // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
221     if (LegalVT.getSimpleVT().is512BitVector())
222       return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
223 
224     if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
225       // If we legalized to less than a 512-bit vector, then we will use a vXi1
226       // compare for vXi32/vXi64 for sure. If we have BWI we will also support
227       // vXi16/vXi8.
228       MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
229       if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
230         return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
231     }
232   }
233 
234   return VT.changeVectorElementTypeToInteger();
235 }
236 
functionArgumentNeedsConsecutiveRegisters(Type * Ty,CallingConv::ID CallConv,bool isVarArg,const DataLayout & DL) const237 bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters(
238     Type *Ty, CallingConv::ID CallConv, bool isVarArg,
239     const DataLayout &DL) const {
240   // On x86-64 i128 is split into two i64s and needs to be allocated to two
241   // consecutive registers, or spilled to the stack as a whole. On x86-32 i128
242   // is split to four i32s and never actually passed in registers, but we use
243   // the consecutive register mark to match it in TableGen.
244   if (Ty->isIntegerTy(128))
245     return true;
246 
247   // On x86-32, fp128 acts the same as i128.
248   if (Subtarget.is32Bit() && Ty->isFP128Ty())
249     return true;
250 
251   return false;
252 }
253 
254 /// Helper for getByValTypeAlignment to determine
255 /// the desired ByVal argument alignment.
getMaxByValAlign(Type * Ty,Align & MaxAlign)256 static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
257   if (MaxAlign == 16)
258     return;
259   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
260     if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
261       MaxAlign = Align(16);
262   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
263     Align EltAlign;
264     getMaxByValAlign(ATy->getElementType(), EltAlign);
265     if (EltAlign > MaxAlign)
266       MaxAlign = EltAlign;
267   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
268     for (auto *EltTy : STy->elements()) {
269       Align EltAlign;
270       getMaxByValAlign(EltTy, EltAlign);
271       if (EltAlign > MaxAlign)
272         MaxAlign = EltAlign;
273       if (MaxAlign == 16)
274         break;
275     }
276   }
277 }
278 
279 /// Return the desired alignment for ByVal aggregate
280 /// function arguments in the caller parameter area. For X86, aggregates
281 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
282 /// are at 4-byte boundaries.
getByValTypeAlignment(Type * Ty,const DataLayout & DL) const283 Align X86TargetLowering::getByValTypeAlignment(Type *Ty,
284                                                const DataLayout &DL) const {
285   if (Subtarget.is64Bit())
286     return std::max(DL.getABITypeAlign(Ty), Align::Constant<8>());
287 
288   Align Alignment(4);
289   if (Subtarget.hasSSE1())
290     getMaxByValAlign(Ty, Alignment);
291   return Alignment;
292 }
293 
294 /// It returns EVT::Other if the type should be determined using generic
295 /// target-independent logic.
296 /// For vector ops we check that the overall size isn't larger than our
297 /// preferred vector width.
getOptimalMemOpType(LLVMContext & Context,const MemOp & Op,const AttributeList & FuncAttributes) const298 EVT X86TargetLowering::getOptimalMemOpType(
299     LLVMContext &Context, const MemOp &Op,
300     const AttributeList &FuncAttributes) const {
301   if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
302     if (Op.size() >= 16 &&
303         (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
304       // FIXME: Check if unaligned 64-byte accesses are slow.
305       if (Op.size() >= 64 && Subtarget.hasAVX512() && Subtarget.hasEVEX512() &&
306           (Subtarget.getPreferVectorWidth() >= 512)) {
307         return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
308       }
309       // FIXME: Check if unaligned 32-byte accesses are slow.
310       if (Op.size() >= 32 && Subtarget.hasAVX() &&
311           Subtarget.useLight256BitInstructions()) {
312         // Although this isn't a well-supported type for AVX1, we'll let
313         // legalization and shuffle lowering produce the optimal codegen. If we
314         // choose an optimal type with a vector element larger than a byte,
315         // getMemsetStores() may create an intermediate splat (using an integer
316         // multiply) before we splat as a vector.
317         return MVT::v32i8;
318       }
319       if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
320         return MVT::v16i8;
321       // TODO: Can SSE1 handle a byte vector?
322       // If we have SSE1 registers we should be able to use them.
323       if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
324           (Subtarget.getPreferVectorWidth() >= 128))
325         return MVT::v4f32;
326     } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
327                Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
328       // Do not use f64 to lower memcpy if source is string constant. It's
329       // better to use i32 to avoid the loads.
330       // Also, do not use f64 to lower memset unless this is a memset of zeros.
331       // The gymnastics of splatting a byte value into an XMM register and then
332       // only using 8-byte stores (because this is a CPU with slow unaligned
333       // 16-byte accesses) makes that a loser.
334       return MVT::f64;
335     }
336   }
337   // This is a compromise. If we reach here, unaligned accesses may be slow on
338   // this target. However, creating smaller, aligned accesses could be even
339   // slower and would certainly be a lot more code.
340   if (Subtarget.is64Bit() && Op.size() >= 8)
341     return MVT::i64;
342   return MVT::i32;
343 }
344 
isSafeMemOpType(MVT VT) const345 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
346   if (VT == MVT::f32)
347     return Subtarget.hasSSE1();
348   if (VT == MVT::f64)
349     return Subtarget.hasSSE2();
350   return true;
351 }
352 
isBitAligned(Align Alignment,uint64_t SizeInBits)353 static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
354   return (8 * Alignment.value()) % SizeInBits == 0;
355 }
356 
isMemoryAccessFast(EVT VT,Align Alignment) const357 bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
358   if (isBitAligned(Alignment, VT.getSizeInBits()))
359     return true;
360   switch (VT.getSizeInBits()) {
361   default:
362     // 8-byte and under are always assumed to be fast.
363     return true;
364   case 128:
365     return !Subtarget.isUnalignedMem16Slow();
366   case 256:
367     return !Subtarget.isUnalignedMem32Slow();
368     // TODO: What about AVX-512 (512-bit) accesses?
369   }
370 }
371 
allowsMisalignedMemoryAccesses(EVT VT,unsigned,Align Alignment,MachineMemOperand::Flags Flags,unsigned * Fast) const372 bool X86TargetLowering::allowsMisalignedMemoryAccesses(
373     EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
374     unsigned *Fast) const {
375   if (Fast)
376     *Fast = isMemoryAccessFast(VT, Alignment);
377   // NonTemporal vector memory ops must be aligned.
378   if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
379     // NT loads can only be vector aligned, so if its less aligned than the
380     // minimum vector size (which we can split the vector down to), we might as
381     // well use a regular unaligned vector load.
382     // We don't have any NT loads pre-SSE41.
383     if (!!(Flags & MachineMemOperand::MOLoad))
384       return (Alignment < 16 || !Subtarget.hasSSE41());
385     return false;
386   }
387   // Misaligned accesses of any size are always allowed.
388   return true;
389 }
390 
allowsMemoryAccess(LLVMContext & Context,const DataLayout & DL,EVT VT,unsigned AddrSpace,Align Alignment,MachineMemOperand::Flags Flags,unsigned * Fast) const391 bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
392                                            const DataLayout &DL, EVT VT,
393                                            unsigned AddrSpace, Align Alignment,
394                                            MachineMemOperand::Flags Flags,
395                                            unsigned *Fast) const {
396   if (Fast)
397     *Fast = isMemoryAccessFast(VT, Alignment);
398   if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
399     if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
400                                        /*Fast=*/nullptr))
401       return true;
402     // NonTemporal vector memory ops are special, and must be aligned.
403     if (!isBitAligned(Alignment, VT.getSizeInBits()))
404       return false;
405     switch (VT.getSizeInBits()) {
406     case 128:
407       if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
408         return true;
409       if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
410         return true;
411       return false;
412     case 256:
413       if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
414         return true;
415       if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
416         return true;
417       return false;
418     case 512:
419       if (Subtarget.hasAVX512() && Subtarget.hasEVEX512())
420         return true;
421       return false;
422     default:
423       return false; // Don't have NonTemporal vector memory ops of this size.
424     }
425   }
426   return true;
427 }
428 
429 /// Return the entry encoding for a jump table in the
430 /// current function.  The returned value is a member of the
431 /// MachineJumpTableInfo::JTEntryKind enum.
getJumpTableEncoding() const432 unsigned X86TargetLowering::getJumpTableEncoding() const {
433   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
434   // symbol.
435   if (isPositionIndependent() && Subtarget.isPICStyleGOT())
436     return MachineJumpTableInfo::EK_Custom32;
437   if (isPositionIndependent() &&
438       getTargetMachine().getCodeModel() == CodeModel::Large &&
439       !Subtarget.isTargetCOFF())
440     return MachineJumpTableInfo::EK_LabelDifference64;
441 
442   // Otherwise, use the normal jump table encoding heuristics.
443   return TargetLowering::getJumpTableEncoding();
444 }
445 
useSoftFloat() const446 bool X86TargetLowering::useSoftFloat() const {
447   return Subtarget.useSoftFloat();
448 }
449 
markLibCallAttributes(MachineFunction * MF,unsigned CC,ArgListTy & Args) const450 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
451                                               ArgListTy &Args) const {
452 
453   // Only relabel X86-32 for C / Stdcall CCs.
454   if (Subtarget.is64Bit())
455     return;
456   if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
457     return;
458   unsigned ParamRegs = 0;
459   if (auto *M = MF->getFunction().getParent())
460     ParamRegs = M->getNumberRegisterParameters();
461 
462   // Mark the first N int arguments as having reg
463   for (auto &Arg : Args) {
464     Type *T = Arg.Ty;
465     if (T->isIntOrPtrTy())
466       if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
467         unsigned numRegs = 1;
468         if (MF->getDataLayout().getTypeAllocSize(T) > 4)
469           numRegs = 2;
470         if (ParamRegs < numRegs)
471           return;
472         ParamRegs -= numRegs;
473         Arg.IsInReg = true;
474       }
475   }
476 }
477 
478 const MCExpr *
LowerCustomJumpTableEntry(const MachineJumpTableInfo * MJTI,const MachineBasicBlock * MBB,unsigned uid,MCContext & Ctx) const479 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
480                                              const MachineBasicBlock *MBB,
481                                              unsigned uid,MCContext &Ctx) const{
482   assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
483   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
484   // entries.
485   return MCSymbolRefExpr::create(MBB->getSymbol(), X86::S_GOTOFF, Ctx);
486 }
487 
488 /// Returns relocation base for the given PIC jumptable.
getPICJumpTableRelocBase(SDValue Table,SelectionDAG & DAG) const489 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
490                                                     SelectionDAG &DAG) const {
491   if (!Subtarget.is64Bit())
492     // This doesn't have SDLoc associated with it, but is not really the
493     // same as a Register.
494     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
495                        getPointerTy(DAG.getDataLayout()));
496   return Table;
497 }
498 
499 /// This returns the relocation base for the given PIC jumptable,
500 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
501 const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction * MF,unsigned JTI,MCContext & Ctx) const502 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
503                              MCContext &Ctx) const {
504   // X86-64 uses RIP relative addressing based on the jump table label.
505   if (Subtarget.isPICStyleRIPRel() ||
506       (Subtarget.is64Bit() &&
507        getTargetMachine().getCodeModel() == CodeModel::Large))
508     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
509 
510   // Otherwise, the reference is relative to the PIC base.
511   return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
512 }
513 
514 std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo * TRI,MVT VT) const515 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
516                                            MVT VT) const {
517   const TargetRegisterClass *RRC = nullptr;
518   uint8_t Cost = 1;
519   switch (VT.SimpleTy) {
520   default:
521     return TargetLowering::findRepresentativeClass(TRI, VT);
522   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
523     RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
524     break;
525   case MVT::x86mmx:
526     RRC = &X86::VR64RegClass;
527     break;
528   case MVT::f32: case MVT::f64:
529   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
530   case MVT::v4f32: case MVT::v2f64:
531   case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
532   case MVT::v8f32: case MVT::v4f64:
533   case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
534   case MVT::v16f32: case MVT::v8f64:
535     RRC = &X86::VR128XRegClass;
536     break;
537   }
538   return std::make_pair(RRC, Cost);
539 }
540 
getAddressSpace() const541 unsigned X86TargetLowering::getAddressSpace() const {
542   if (Subtarget.is64Bit())
543     return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? X86AS::GS
544                                                                     : X86AS::FS;
545   return X86AS::GS;
546 }
547 
hasStackGuardSlotTLS(const Triple & TargetTriple)548 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
549   return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
550          (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
551 }
552 
SegmentOffset(IRBuilderBase & IRB,int Offset,unsigned AddressSpace)553 static Constant* SegmentOffset(IRBuilderBase &IRB,
554                                int Offset, unsigned AddressSpace) {
555   return ConstantExpr::getIntToPtr(
556       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
557       IRB.getPtrTy(AddressSpace));
558 }
559 
getIRStackGuard(IRBuilderBase & IRB) const560 Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
561   // glibc, bionic, and Fuchsia have a special slot for the stack guard in
562   // tcbhead_t; use it instead of the usual global variable (see
563   // sysdeps/{i386,x86_64}/nptl/tls.h)
564   if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
565     unsigned AddressSpace = getAddressSpace();
566 
567     // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
568     if (Subtarget.isTargetFuchsia())
569       return SegmentOffset(IRB, 0x10, AddressSpace);
570 
571     Module *M = IRB.GetInsertBlock()->getParent()->getParent();
572     // Specially, some users may customize the base reg and offset.
573     int Offset = M->getStackProtectorGuardOffset();
574     // If we don't set -stack-protector-guard-offset value:
575     // %fs:0x28, unless we're using a Kernel code model, in which case
576     // it's %gs:0x28.  gs:0x14 on i386.
577     if (Offset == INT_MAX)
578       Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
579 
580     StringRef GuardReg = M->getStackProtectorGuardReg();
581     if (GuardReg == "fs")
582       AddressSpace = X86AS::FS;
583     else if (GuardReg == "gs")
584       AddressSpace = X86AS::GS;
585 
586     // Use symbol guard if user specify.
587     StringRef GuardSymb = M->getStackProtectorGuardSymbol();
588     if (!GuardSymb.empty()) {
589       GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
590       if (!GV) {
591         Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
592                                        : Type::getInt32Ty(M->getContext());
593         GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
594                                 nullptr, GuardSymb, nullptr,
595                                 GlobalValue::NotThreadLocal, AddressSpace);
596         if (!Subtarget.isTargetDarwin())
597           GV->setDSOLocal(M->getDirectAccessExternalData());
598       }
599       return GV;
600     }
601 
602     return SegmentOffset(IRB, Offset, AddressSpace);
603   }
604   return TargetLowering::getIRStackGuard(IRB);
605 }
606 
insertSSPDeclarations(Module & M) const607 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
608   // MSVC CRT provides functionalities for stack protection.
609   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
610       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
611     // MSVC CRT has a global variable holding security cookie.
612     M.getOrInsertGlobal("__security_cookie",
613                         PointerType::getUnqual(M.getContext()));
614 
615     // MSVC CRT has a function to validate security cookie.
616     FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
617         "__security_check_cookie", Type::getVoidTy(M.getContext()),
618         PointerType::getUnqual(M.getContext()));
619     if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
620       F->setCallingConv(CallingConv::X86_FastCall);
621       F->addParamAttr(0, Attribute::AttrKind::InReg);
622     }
623     return;
624   }
625 
626   StringRef GuardMode = M.getStackProtectorGuard();
627 
628   // glibc, bionic, and Fuchsia have a special slot for the stack guard.
629   if ((GuardMode == "tls" || GuardMode.empty()) &&
630       hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
631     return;
632   TargetLowering::insertSSPDeclarations(M);
633 }
634 
getSDagStackGuard(const Module & M) const635 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
636   // MSVC CRT has a global variable holding security cookie.
637   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
638       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
639     return M.getGlobalVariable("__security_cookie");
640   }
641   return TargetLowering::getSDagStackGuard(M);
642 }
643 
getSSPStackGuardCheck(const Module & M) const644 Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
645   // MSVC CRT has a function to validate security cookie.
646   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
647       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
648     return M.getFunction("__security_check_cookie");
649   }
650   return TargetLowering::getSSPStackGuardCheck(M);
651 }
652 
653 Value *
getSafeStackPointerLocation(IRBuilderBase & IRB) const654 X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
655   // Android provides a fixed TLS slot for the SafeStack pointer. See the
656   // definition of TLS_SLOT_SAFESTACK in
657   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
658   if (Subtarget.isTargetAndroid()) {
659     // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
660     // %gs:0x24 on i386
661     int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
662     return SegmentOffset(IRB, Offset, getAddressSpace());
663   }
664 
665   // Fuchsia is similar.
666   if (Subtarget.isTargetFuchsia()) {
667     // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
668     return SegmentOffset(IRB, 0x18, getAddressSpace());
669   }
670 
671   return TargetLowering::getSafeStackPointerLocation(IRB);
672 }
673 
674 //===----------------------------------------------------------------------===//
675 //               Return Value Calling Convention Implementation
676 //===----------------------------------------------------------------------===//
677 
CanLowerReturn(CallingConv::ID CallConv,MachineFunction & MF,bool isVarArg,const SmallVectorImpl<ISD::OutputArg> & Outs,LLVMContext & Context,const Type * RetTy) const678 bool X86TargetLowering::CanLowerReturn(
679     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
680     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
681     const Type *RetTy) const {
682   SmallVector<CCValAssign, 16> RVLocs;
683   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
684   return CCInfo.CheckReturn(Outs, RetCC_X86);
685 }
686 
getScratchRegisters(CallingConv::ID) const687 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
688   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
689   return ScratchRegs;
690 }
691 
getRoundingControlRegisters() const692 ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
693   static const MCPhysReg RCRegs[] = {X86::FPCW, X86::MXCSR};
694   return RCRegs;
695 }
696 
697 /// Lowers masks values (v*i1) to the local register values
698 /// \returns DAG node after lowering to register type
lowerMasksToReg(const SDValue & ValArg,const EVT & ValLoc,const SDLoc & DL,SelectionDAG & DAG)699 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
700                                const SDLoc &DL, SelectionDAG &DAG) {
701   EVT ValVT = ValArg.getValueType();
702 
703   if (ValVT == MVT::v1i1)
704     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ValLoc, ValArg,
705                        DAG.getIntPtrConstant(0, DL));
706 
707   if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
708       (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
709     // Two stage lowering might be required
710     // bitcast:   v8i1 -> i8 / v16i1 -> i16
711     // anyextend: i8   -> i32 / i16   -> i32
712     EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
713     SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
714     if (ValLoc == MVT::i32)
715       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValToCopy);
716     return ValToCopy;
717   }
718 
719   if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
720       (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
721     // One stage lowering is required
722     // bitcast:   v32i1 -> i32 / v64i1 -> i64
723     return DAG.getBitcast(ValLoc, ValArg);
724   }
725 
726   return DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValArg);
727 }
728 
729 /// Breaks v64i1 value into two registers and adds the new node to the DAG
Passv64i1ArgInRegs(const SDLoc & DL,SelectionDAG & DAG,SDValue & Arg,SmallVectorImpl<std::pair<Register,SDValue>> & RegsToPass,CCValAssign & VA,CCValAssign & NextVA,const X86Subtarget & Subtarget)730 static void Passv64i1ArgInRegs(
731     const SDLoc &DL, SelectionDAG &DAG, SDValue &Arg,
732     SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
733     CCValAssign &NextVA, const X86Subtarget &Subtarget) {
734   assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
735   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
736   assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
737   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
738          "The value should reside in two registers");
739 
740   // Before splitting the value we cast it to i64
741   Arg = DAG.getBitcast(MVT::i64, Arg);
742 
743   // Splitting the value into two i32 types
744   SDValue Lo, Hi;
745   std::tie(Lo, Hi) = DAG.SplitScalar(Arg, DL, MVT::i32, MVT::i32);
746 
747   // Attach the two i32 types into corresponding registers
748   RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
749   RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
750 }
751 
752 SDValue
LowerReturn(SDValue Chain,CallingConv::ID CallConv,bool isVarArg,const SmallVectorImpl<ISD::OutputArg> & Outs,const SmallVectorImpl<SDValue> & OutVals,const SDLoc & dl,SelectionDAG & DAG) const753 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
754                                bool isVarArg,
755                                const SmallVectorImpl<ISD::OutputArg> &Outs,
756                                const SmallVectorImpl<SDValue> &OutVals,
757                                const SDLoc &dl, SelectionDAG &DAG) const {
758   MachineFunction &MF = DAG.getMachineFunction();
759   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
760 
761   // In some cases we need to disable registers from the default CSR list.
762   // For example, when they are used as return registers (preserve_* and X86's
763   // regcall) or for argument passing (X86's regcall).
764   bool ShouldDisableCalleeSavedRegister =
765       shouldDisableRetRegFromCSR(CallConv) ||
766       MF.getFunction().hasFnAttribute("no_caller_saved_registers");
767 
768   if (CallConv == CallingConv::X86_INTR && !Outs.empty())
769     report_fatal_error("X86 interrupts may not return any value");
770 
771   SmallVector<CCValAssign, 16> RVLocs;
772   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
773   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
774 
775   SmallVector<std::pair<Register, SDValue>, 4> RetVals;
776   for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
777        ++I, ++OutsIndex) {
778     CCValAssign &VA = RVLocs[I];
779     assert(VA.isRegLoc() && "Can only return in registers!");
780 
781     // Add the register to the CalleeSaveDisableRegs list.
782     if (ShouldDisableCalleeSavedRegister)
783       MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
784 
785     SDValue ValToCopy = OutVals[OutsIndex];
786     EVT ValVT = ValToCopy.getValueType();
787 
788     // Promote values to the appropriate types.
789     if (VA.getLocInfo() == CCValAssign::SExt)
790       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
791     else if (VA.getLocInfo() == CCValAssign::ZExt)
792       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
793     else if (VA.getLocInfo() == CCValAssign::AExt) {
794       if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
795         ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
796       else
797         ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
798     }
799     else if (VA.getLocInfo() == CCValAssign::BCvt)
800       ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
801 
802     assert(VA.getLocInfo() != CCValAssign::FPExt &&
803            "Unexpected FP-extend for return value.");
804 
805     // Report an error if we have attempted to return a value via an XMM
806     // register and SSE was disabled.
807     if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
808       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
809       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
810     } else if (!Subtarget.hasSSE2() &&
811                X86::FR64XRegClass.contains(VA.getLocReg()) &&
812                ValVT == MVT::f64) {
813       // When returning a double via an XMM register, report an error if SSE2 is
814       // not enabled.
815       errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
816       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
817     }
818 
819     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
820     // the RET instruction and handled by the FP Stackifier.
821     if (VA.getLocReg() == X86::FP0 ||
822         VA.getLocReg() == X86::FP1) {
823       // If this is a copy from an xmm register to ST(0), use an FPExtend to
824       // change the value to the FP stack register class.
825       if (isScalarFPTypeInSSEReg(VA.getValVT()))
826         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
827       RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
828       // Don't emit a copytoreg.
829       continue;
830     }
831 
832     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
833     // which is returned in RAX / RDX.
834     if (Subtarget.is64Bit()) {
835       if (ValVT == MVT::x86mmx) {
836         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
837           ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
838           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
839                                   ValToCopy);
840           // If we don't have SSE2 available, convert to v4f32 so the generated
841           // register is legal.
842           if (!Subtarget.hasSSE2())
843             ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
844         }
845       }
846     }
847 
848     if (VA.needsCustom()) {
849       assert(VA.getValVT() == MVT::v64i1 &&
850              "Currently the only custom case is when we split v64i1 to 2 regs");
851 
852       Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
853                          Subtarget);
854 
855       // Add the second register to the CalleeSaveDisableRegs list.
856       if (ShouldDisableCalleeSavedRegister)
857         MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
858     } else {
859       RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
860     }
861   }
862 
863   SDValue Glue;
864   SmallVector<SDValue, 6> RetOps;
865   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
866   // Operand #1 = Bytes To Pop
867   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
868                    MVT::i32));
869 
870   // Copy the result values into the output registers.
871   for (auto &RetVal : RetVals) {
872     if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
873       RetOps.push_back(RetVal.second);
874       continue; // Don't emit a copytoreg.
875     }
876 
877     Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
878     Glue = Chain.getValue(1);
879     RetOps.push_back(
880         DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
881   }
882 
883   // Swift calling convention does not require we copy the sret argument
884   // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
885 
886   // All x86 ABIs require that for returning structs by value we copy
887   // the sret argument into %rax/%eax (depending on ABI) for the return.
888   // We saved the argument into a virtual register in the entry block,
889   // so now we copy the value out and into %rax/%eax.
890   //
891   // Checking Function.hasStructRetAttr() here is insufficient because the IR
892   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
893   // false, then an sret argument may be implicitly inserted in the SelDAG. In
894   // either case FuncInfo->setSRetReturnReg() will have been called.
895   if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
896     // When we have both sret and another return value, we should use the
897     // original Chain stored in RetOps[0], instead of the current Chain updated
898     // in the above loop. If we only have sret, RetOps[0] equals to Chain.
899 
900     // For the case of sret and another return value, we have
901     //   Chain_0 at the function entry
902     //   Chain_1 = getCopyToReg(Chain_0) in the above loop
903     // If we use Chain_1 in getCopyFromReg, we will have
904     //   Val = getCopyFromReg(Chain_1)
905     //   Chain_2 = getCopyToReg(Chain_1, Val) from below
906 
907     // getCopyToReg(Chain_0) will be glued together with
908     // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
909     // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
910     //   Data dependency from Unit B to Unit A due to usage of Val in
911     //     getCopyToReg(Chain_1, Val)
912     //   Chain dependency from Unit A to Unit B
913 
914     // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
915     SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
916                                      getPointerTy(MF.getDataLayout()));
917 
918     Register RetValReg
919         = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
920           X86::RAX : X86::EAX;
921     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
922     Glue = Chain.getValue(1);
923 
924     // RAX/EAX now acts like a return value.
925     RetOps.push_back(
926         DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
927 
928     // Add the returned register to the CalleeSaveDisableRegs list. Don't do
929     // this however for preserve_most/preserve_all to minimize the number of
930     // callee-saved registers for these CCs.
931     if (ShouldDisableCalleeSavedRegister &&
932         CallConv != CallingConv::PreserveAll &&
933         CallConv != CallingConv::PreserveMost)
934       MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
935   }
936 
937   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
938   const MCPhysReg *I =
939       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
940   if (I) {
941     for (; *I; ++I) {
942       if (X86::GR64RegClass.contains(*I))
943         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
944       else
945         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
946     }
947   }
948 
949   RetOps[0] = Chain;  // Update chain.
950 
951   // Add the glue if we have it.
952   if (Glue.getNode())
953     RetOps.push_back(Glue);
954 
955   X86ISD::NodeType opcode = X86ISD::RET_GLUE;
956   if (CallConv == CallingConv::X86_INTR)
957     opcode = X86ISD::IRET;
958   return DAG.getNode(opcode, dl, MVT::Other, RetOps);
959 }
960 
isUsedByReturnOnly(SDNode * N,SDValue & Chain) const961 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
962   if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
963     return false;
964 
965   SDValue TCChain = Chain;
966   SDNode *Copy = *N->user_begin();
967   if (Copy->getOpcode() == ISD::CopyToReg) {
968     // If the copy has a glue operand, we conservatively assume it isn't safe to
969     // perform a tail call.
970     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
971       return false;
972     TCChain = Copy->getOperand(0);
973   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
974     return false;
975 
976   bool HasRet = false;
977   for (const SDNode *U : Copy->users()) {
978     if (U->getOpcode() != X86ISD::RET_GLUE)
979       return false;
980     // If we are returning more than one value, we can definitely
981     // not make a tail call see PR19530
982     if (U->getNumOperands() > 4)
983       return false;
984     if (U->getNumOperands() == 4 &&
985         U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
986       return false;
987     HasRet = true;
988   }
989 
990   if (!HasRet)
991     return false;
992 
993   Chain = TCChain;
994   return true;
995 }
996 
getTypeForExtReturn(LLVMContext & Context,EVT VT,ISD::NodeType ExtendKind) const997 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
998                                            ISD::NodeType ExtendKind) const {
999   MVT ReturnMVT = MVT::i32;
1000 
1001   bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
1002   if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
1003     // The ABI does not require i1, i8 or i16 to be extended.
1004     //
1005     // On Darwin, there is code in the wild relying on Clang's old behaviour of
1006     // always extending i8/i16 return values, so keep doing that for now.
1007     // (PR26665).
1008     ReturnMVT = MVT::i8;
1009   }
1010 
1011   EVT MinVT = getRegisterType(Context, ReturnMVT);
1012   return VT.bitsLT(MinVT) ? MinVT : VT;
1013 }
1014 
1015 /// Reads two 32 bit registers and creates a 64 bit mask value.
1016 /// \param VA The current 32 bit value that need to be assigned.
1017 /// \param NextVA The next 32 bit value that need to be assigned.
1018 /// \param Root The parent DAG node.
1019 /// \param [in,out] InGlue Represents SDvalue in the parent DAG node for
1020 ///                        glue purposes. In the case the DAG is already using
1021 ///                        physical register instead of virtual, we should glue
1022 ///                        our new SDValue to InGlue SDvalue.
1023 /// \return a new SDvalue of size 64bit.
getv64i1Argument(CCValAssign & VA,CCValAssign & NextVA,SDValue & Root,SelectionDAG & DAG,const SDLoc & DL,const X86Subtarget & Subtarget,SDValue * InGlue=nullptr)1024 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
1025                                 SDValue &Root, SelectionDAG &DAG,
1026                                 const SDLoc &DL, const X86Subtarget &Subtarget,
1027                                 SDValue *InGlue = nullptr) {
1028   assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
1029   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
1030   assert(VA.getValVT() == MVT::v64i1 &&
1031          "Expecting first location of 64 bit width type");
1032   assert(NextVA.getValVT() == VA.getValVT() &&
1033          "The locations should have the same type");
1034   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
1035          "The values should reside in two registers");
1036 
1037   SDValue Lo, Hi;
1038   SDValue ArgValueLo, ArgValueHi;
1039 
1040   MachineFunction &MF = DAG.getMachineFunction();
1041   const TargetRegisterClass *RC = &X86::GR32RegClass;
1042 
1043   // Read a 32 bit value from the registers.
1044   if (nullptr == InGlue) {
1045     // When no physical register is present,
1046     // create an intermediate virtual register.
1047     Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
1048     ArgValueLo = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
1049     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
1050     ArgValueHi = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
1051   } else {
1052     // When a physical register is available read the value from it and glue
1053     // the reads together.
1054     ArgValueLo =
1055       DAG.getCopyFromReg(Root, DL, VA.getLocReg(), MVT::i32, *InGlue);
1056     *InGlue = ArgValueLo.getValue(2);
1057     ArgValueHi =
1058       DAG.getCopyFromReg(Root, DL, NextVA.getLocReg(), MVT::i32, *InGlue);
1059     *InGlue = ArgValueHi.getValue(2);
1060   }
1061 
1062   // Convert the i32 type into v32i1 type.
1063   Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
1064 
1065   // Convert the i32 type into v32i1 type.
1066   Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
1067 
1068   // Concatenate the two values together.
1069   return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v64i1, Lo, Hi);
1070 }
1071 
1072 /// The function will lower a register of various sizes (8/16/32/64)
1073 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
1074 /// \returns a DAG node contains the operand after lowering to mask type.
lowerRegToMasks(const SDValue & ValArg,const EVT & ValVT,const EVT & ValLoc,const SDLoc & DL,SelectionDAG & DAG)1075 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
1076                                const EVT &ValLoc, const SDLoc &DL,
1077                                SelectionDAG &DAG) {
1078   SDValue ValReturned = ValArg;
1079 
1080   if (ValVT == MVT::v1i1)
1081     return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, ValReturned);
1082 
1083   if (ValVT == MVT::v64i1) {
1084     // In 32 bit machine, this case is handled by getv64i1Argument
1085     assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
1086     // In 64 bit machine, There is no need to truncate the value only bitcast
1087   } else {
1088     MVT MaskLenVT;
1089     switch (ValVT.getSimpleVT().SimpleTy) {
1090     case MVT::v8i1:
1091       MaskLenVT = MVT::i8;
1092       break;
1093     case MVT::v16i1:
1094       MaskLenVT = MVT::i16;
1095       break;
1096     case MVT::v32i1:
1097       MaskLenVT = MVT::i32;
1098       break;
1099     default:
1100       llvm_unreachable("Expecting a vector of i1 types");
1101     }
1102 
1103     ValReturned = DAG.getNode(ISD::TRUNCATE, DL, MaskLenVT, ValReturned);
1104   }
1105   return DAG.getBitcast(ValVT, ValReturned);
1106 }
1107 
getPopFromX87Reg(SelectionDAG & DAG,SDValue Chain,const SDLoc & dl,Register Reg,EVT VT,SDValue Glue)1108 static SDValue getPopFromX87Reg(SelectionDAG &DAG, SDValue Chain,
1109                                 const SDLoc &dl, Register Reg, EVT VT,
1110                                 SDValue Glue) {
1111   SDVTList VTs = DAG.getVTList(VT, MVT::Other, MVT::Glue);
1112   SDValue Ops[] = {Chain, DAG.getRegister(Reg, VT), Glue};
1113   return DAG.getNode(X86ISD::POP_FROM_X87_REG, dl, VTs,
1114                      ArrayRef(Ops, Glue.getNode() ? 3 : 2));
1115 }
1116 
1117 /// Lower the result values of a call into the
1118 /// appropriate copies out of appropriate physical registers.
1119 ///
LowerCallResult(SDValue Chain,SDValue InGlue,CallingConv::ID CallConv,bool isVarArg,const SmallVectorImpl<ISD::InputArg> & Ins,const SDLoc & dl,SelectionDAG & DAG,SmallVectorImpl<SDValue> & InVals,uint32_t * RegMask) const1120 SDValue X86TargetLowering::LowerCallResult(
1121     SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
1122     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1123     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
1124     uint32_t *RegMask) const {
1125 
1126   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
1127   // Assign locations to each value returned by this call.
1128   SmallVector<CCValAssign, 16> RVLocs;
1129   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1130                  *DAG.getContext());
1131   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1132 
1133   // Copy all of the result registers out of their specified physreg.
1134   for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
1135        ++I, ++InsIndex) {
1136     CCValAssign &VA = RVLocs[I];
1137     EVT CopyVT = VA.getLocVT();
1138 
1139     // In some calling conventions we need to remove the used registers
1140     // from the register mask.
1141     if (RegMask) {
1142       for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg()))
1143         RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
1144     }
1145 
1146     // Report an error if there was an attempt to return FP values via XMM
1147     // registers.
1148     if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
1149       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
1150       if (VA.getLocReg() == X86::XMM1)
1151         VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
1152       else
1153         VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
1154     } else if (!Subtarget.hasSSE2() &&
1155                X86::FR64XRegClass.contains(VA.getLocReg()) &&
1156                CopyVT == MVT::f64) {
1157       errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
1158       if (VA.getLocReg() == X86::XMM1)
1159         VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
1160       else
1161         VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
1162     }
1163 
1164     // If we prefer to use the value in xmm registers, copy it out as f80 and
1165     // use a truncate to move it from fp stack reg to xmm reg.
1166     bool RoundAfterCopy = false;
1167     bool X87Result = VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1;
1168     if (X87Result && isScalarFPTypeInSSEReg(VA.getValVT())) {
1169       if (!Subtarget.hasX87())
1170         report_fatal_error("X87 register return with X87 disabled");
1171       CopyVT = MVT::f80;
1172       RoundAfterCopy = (CopyVT != VA.getLocVT());
1173     }
1174 
1175     SDValue Val;
1176     if (VA.needsCustom()) {
1177       assert(VA.getValVT() == MVT::v64i1 &&
1178              "Currently the only custom case is when we split v64i1 to 2 regs");
1179       Val =
1180           getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
1181     } else {
1182       Chain =
1183           X87Result
1184               ? getPopFromX87Reg(DAG, Chain, dl, VA.getLocReg(), CopyVT, InGlue)
1185                     .getValue(1)
1186               : DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
1187                     .getValue(1);
1188       Val = Chain.getValue(0);
1189       InGlue = Chain.getValue(2);
1190     }
1191 
1192     if (RoundAfterCopy)
1193       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1194                         // This truncation won't change the value.
1195                         DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
1196 
1197     if (VA.isExtInLoc()) {
1198       if (VA.getValVT().isVector() &&
1199           VA.getValVT().getScalarType() == MVT::i1 &&
1200           ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
1201            (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
1202         // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
1203         Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
1204       } else
1205         Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
1206     }
1207 
1208     if (VA.getLocInfo() == CCValAssign::BCvt)
1209       Val = DAG.getBitcast(VA.getValVT(), Val);
1210 
1211     InVals.push_back(Val);
1212   }
1213 
1214   return Chain;
1215 }
1216 
1217 //===----------------------------------------------------------------------===//
1218 //                C & StdCall & Fast Calling Convention implementation
1219 //===----------------------------------------------------------------------===//
1220 //  StdCall calling convention seems to be standard for many Windows' API
1221 //  routines and around. It differs from C calling convention just a little:
1222 //  callee should clean up the stack, not caller. Symbols should be also
1223 //  decorated in some fancy way :) It doesn't support any vector arguments.
1224 //  For info on fast calling convention see Fast Calling Convention (tail call)
1225 //  implementation LowerX86_32FastCCCallTo.
1226 
1227 /// Determines whether Args, either a set of outgoing arguments to a call, or a
1228 /// set of incoming args of a call, contains an sret pointer that the callee
1229 /// pops
1230 template <typename T>
hasCalleePopSRet(const SmallVectorImpl<T> & Args,const X86Subtarget & Subtarget)1231 static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
1232                              const X86Subtarget &Subtarget) {
1233   // Not C++20 (yet), so no concepts available.
1234   static_assert(std::is_same_v<T, ISD::OutputArg> ||
1235                     std::is_same_v<T, ISD::InputArg>,
1236                 "requires ISD::OutputArg or ISD::InputArg");
1237 
1238   // Only 32-bit pops the sret.  It's a 64-bit world these days, so early-out
1239   // for most compilations.
1240   if (!Subtarget.is32Bit())
1241     return false;
1242 
1243   if (Args.empty())
1244     return false;
1245 
1246   // Most calls do not have an sret argument, check the arg next.
1247   const ISD::ArgFlagsTy &Flags = Args[0].Flags;
1248   if (!Flags.isSRet() || Flags.isInReg())
1249     return false;
1250 
1251   // The MSVCabi does not pop the sret.
1252   if (Subtarget.getTargetTriple().isOSMSVCRT())
1253     return false;
1254 
1255   // MCUs don't pop the sret
1256   if (Subtarget.isTargetMCU())
1257     return false;
1258 
1259   // Callee pops argument
1260   return true;
1261 }
1262 
1263 /// Make a copy of an aggregate at address specified by "Src" to address
1264 /// "Dst" with size and alignment information specified by the specific
1265 /// parameter attribute. The copy will be passed as a byval function parameter.
CreateCopyOfByValArgument(SDValue Src,SDValue Dst,SDValue Chain,ISD::ArgFlagsTy Flags,SelectionDAG & DAG,const SDLoc & dl)1266 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
1267                                          SDValue Chain, ISD::ArgFlagsTy Flags,
1268                                          SelectionDAG &DAG, const SDLoc &dl) {
1269   SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
1270 
1271   return DAG.getMemcpy(
1272       Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
1273       /*isVolatile*/ false, /*AlwaysInline=*/true,
1274       /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), MachinePointerInfo());
1275 }
1276 
1277 /// Return true if the calling convention is one that we can guarantee TCO for.
canGuaranteeTCO(CallingConv::ID CC)1278 static bool canGuaranteeTCO(CallingConv::ID CC) {
1279   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
1280           CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
1281           CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
1282 }
1283 
1284 /// Return true if we might ever do TCO for calls with this calling convention.
mayTailCallThisCC(CallingConv::ID CC)1285 static bool mayTailCallThisCC(CallingConv::ID CC) {
1286   switch (CC) {
1287   // C calling conventions:
1288   case CallingConv::C:
1289   case CallingConv::Win64:
1290   case CallingConv::X86_64_SysV:
1291   case CallingConv::PreserveNone:
1292   // Callee pop conventions:
1293   case CallingConv::X86_ThisCall:
1294   case CallingConv::X86_StdCall:
1295   case CallingConv::X86_VectorCall:
1296   case CallingConv::X86_FastCall:
1297   // Swift:
1298   case CallingConv::Swift:
1299     return true;
1300   default:
1301     return canGuaranteeTCO(CC);
1302   }
1303 }
1304 
1305 /// Return true if the function is being made into a tailcall target by
1306 /// changing its ABI.
shouldGuaranteeTCO(CallingConv::ID CC,bool GuaranteedTailCallOpt)1307 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
1308   return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
1309          CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
1310 }
1311 
mayBeEmittedAsTailCall(const CallInst * CI) const1312 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
1313   if (!CI->isTailCall())
1314     return false;
1315 
1316   CallingConv::ID CalleeCC = CI->getCallingConv();
1317   if (!mayTailCallThisCC(CalleeCC))
1318     return false;
1319 
1320   return true;
1321 }
1322 
1323 SDValue
LowerMemArgument(SDValue Chain,CallingConv::ID CallConv,const SmallVectorImpl<ISD::InputArg> & Ins,const SDLoc & dl,SelectionDAG & DAG,const CCValAssign & VA,MachineFrameInfo & MFI,unsigned i) const1324 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
1325                                     const SmallVectorImpl<ISD::InputArg> &Ins,
1326                                     const SDLoc &dl, SelectionDAG &DAG,
1327                                     const CCValAssign &VA,
1328                                     MachineFrameInfo &MFI, unsigned i) const {
1329   // Create the nodes corresponding to a load from this parameter slot.
1330   ISD::ArgFlagsTy Flags = Ins[i].Flags;
1331   bool AlwaysUseMutable = shouldGuaranteeTCO(
1332       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
1333   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1334   EVT ValVT;
1335   MVT PtrVT = getPointerTy(DAG.getDataLayout());
1336 
1337   // If value is passed by pointer we have address passed instead of the value
1338   // itself. No need to extend if the mask value and location share the same
1339   // absolute size.
1340   bool ExtendedInMem =
1341       VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
1342       VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
1343 
1344   if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
1345     ValVT = VA.getLocVT();
1346   else
1347     ValVT = VA.getValVT();
1348 
1349   // FIXME: For now, all byval parameter objects are marked mutable. This can be
1350   // changed with more analysis.
1351   // In case of tail call optimization mark all arguments mutable. Since they
1352   // could be overwritten by lowering of arguments in case of a tail call.
1353   if (Flags.isByVal()) {
1354     unsigned Bytes = Flags.getByValSize();
1355     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
1356 
1357     // FIXME: For now, all byval parameter objects are marked as aliasing. This
1358     // can be improved with deeper analysis.
1359     int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
1360                                    /*isAliased=*/true);
1361     return DAG.getFrameIndex(FI, PtrVT);
1362   }
1363 
1364   EVT ArgVT = Ins[i].ArgVT;
1365 
1366   // If this is a vector that has been split into multiple parts, don't elide
1367   // the copy. The layout on the stack may not match the packed in-memory
1368   // layout.
1369   bool ScalarizedVector = ArgVT.isVector() && !VA.getLocVT().isVector();
1370 
1371   // This is an argument in memory. We might be able to perform copy elision.
1372   // If the argument is passed directly in memory without any extension, then we
1373   // can perform copy elision. Large vector types, for example, may be passed
1374   // indirectly by pointer.
1375   if (Flags.isCopyElisionCandidate() &&
1376       VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
1377       !ScalarizedVector) {
1378     SDValue PartAddr;
1379     if (Ins[i].PartOffset == 0) {
1380       // If this is a one-part value or the first part of a multi-part value,
1381       // create a stack object for the entire argument value type and return a
1382       // load from our portion of it. This assumes that if the first part of an
1383       // argument is in memory, the rest will also be in memory.
1384       int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
1385                                      /*IsImmutable=*/false);
1386       PartAddr = DAG.getFrameIndex(FI, PtrVT);
1387       return DAG.getLoad(
1388           ValVT, dl, Chain, PartAddr,
1389           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
1390     }
1391 
1392     // This is not the first piece of an argument in memory. See if there is
1393     // already a fixed stack object including this offset. If so, assume it
1394     // was created by the PartOffset == 0 branch above and create a load from
1395     // the appropriate offset into it.
1396     int64_t PartBegin = VA.getLocMemOffset();
1397     int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
1398     int FI = MFI.getObjectIndexBegin();
1399     for (; MFI.isFixedObjectIndex(FI); ++FI) {
1400       int64_t ObjBegin = MFI.getObjectOffset(FI);
1401       int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
1402       if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
1403         break;
1404     }
1405     if (MFI.isFixedObjectIndex(FI)) {
1406       SDValue Addr =
1407           DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
1408                       DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
1409       return DAG.getLoad(ValVT, dl, Chain, Addr,
1410                          MachinePointerInfo::getFixedStack(
1411                              DAG.getMachineFunction(), FI, Ins[i].PartOffset));
1412     }
1413   }
1414 
1415   int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
1416                                  VA.getLocMemOffset(), isImmutable);
1417 
1418   // Set SExt or ZExt flag.
1419   if (VA.getLocInfo() == CCValAssign::ZExt) {
1420     MFI.setObjectZExt(FI, true);
1421   } else if (VA.getLocInfo() == CCValAssign::SExt) {
1422     MFI.setObjectSExt(FI, true);
1423   }
1424 
1425   MaybeAlign Alignment;
1426   if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
1427       ValVT != MVT::f80)
1428     Alignment = MaybeAlign(4);
1429   SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
1430   SDValue Val = DAG.getLoad(
1431       ValVT, dl, Chain, FIN,
1432       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1433       Alignment);
1434   return ExtendedInMem
1435              ? (VA.getValVT().isVector()
1436                     ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
1437                     : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
1438              : Val;
1439 }
1440 
1441 // FIXME: Get this from tablegen.
get64BitArgumentGPRs(CallingConv::ID CallConv,const X86Subtarget & Subtarget)1442 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
1443                                                 const X86Subtarget &Subtarget) {
1444   assert(Subtarget.is64Bit());
1445 
1446   if (Subtarget.isCallingConvWin64(CallConv)) {
1447     static const MCPhysReg GPR64ArgRegsWin64[] = {
1448       X86::RCX, X86::RDX, X86::R8,  X86::R9
1449     };
1450     return GPR64ArgRegsWin64;
1451   }
1452 
1453   static const MCPhysReg GPR64ArgRegs64Bit[] = {
1454     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1455   };
1456   return GPR64ArgRegs64Bit;
1457 }
1458 
1459 // FIXME: Get this from tablegen.
get64BitArgumentXMMs(MachineFunction & MF,CallingConv::ID CallConv,const X86Subtarget & Subtarget)1460 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
1461                                                 CallingConv::ID CallConv,
1462                                                 const X86Subtarget &Subtarget) {
1463   assert(Subtarget.is64Bit());
1464   if (Subtarget.isCallingConvWin64(CallConv)) {
1465     // The XMM registers which might contain var arg parameters are shadowed
1466     // in their paired GPR.  So we only need to save the GPR to their home
1467     // slots.
1468     // TODO: __vectorcall will change this.
1469     return {};
1470   }
1471 
1472   bool isSoftFloat = Subtarget.useSoftFloat();
1473   if (isSoftFloat || !Subtarget.hasSSE1())
1474     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
1475     // registers.
1476     return {};
1477 
1478   static const MCPhysReg XMMArgRegs64Bit[] = {
1479     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1480     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1481   };
1482   return XMMArgRegs64Bit;
1483 }
1484 
1485 #ifndef NDEBUG
isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs)1486 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
1487   return llvm::is_sorted(
1488       ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
1489         return A.getValNo() < B.getValNo();
1490       });
1491 }
1492 #endif
1493 
1494 namespace {
1495 /// This is a helper class for lowering variable arguments parameters.
1496 class VarArgsLoweringHelper {
1497 public:
VarArgsLoweringHelper(X86MachineFunctionInfo * FuncInfo,const SDLoc & Loc,SelectionDAG & DAG,const X86Subtarget & Subtarget,CallingConv::ID CallConv,CCState & CCInfo)1498   VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
1499                         SelectionDAG &DAG, const X86Subtarget &Subtarget,
1500                         CallingConv::ID CallConv, CCState &CCInfo)
1501       : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
1502         TheMachineFunction(DAG.getMachineFunction()),
1503         TheFunction(TheMachineFunction.getFunction()),
1504         FrameInfo(TheMachineFunction.getFrameInfo()),
1505         FrameLowering(*Subtarget.getFrameLowering()),
1506         TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
1507         CCInfo(CCInfo) {}
1508 
1509   // Lower variable arguments parameters.
1510   void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
1511 
1512 private:
1513   void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
1514 
1515   void forwardMustTailParameters(SDValue &Chain);
1516 
is64Bit() const1517   bool is64Bit() const { return Subtarget.is64Bit(); }
isWin64() const1518   bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
1519 
1520   X86MachineFunctionInfo *FuncInfo;
1521   const SDLoc &DL;
1522   SelectionDAG &DAG;
1523   const X86Subtarget &Subtarget;
1524   MachineFunction &TheMachineFunction;
1525   const Function &TheFunction;
1526   MachineFrameInfo &FrameInfo;
1527   const TargetFrameLowering &FrameLowering;
1528   const TargetLowering &TargLowering;
1529   CallingConv::ID CallConv;
1530   CCState &CCInfo;
1531 };
1532 } // namespace
1533 
createVarArgAreaAndStoreRegisters(SDValue & Chain,unsigned StackSize)1534 void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
1535     SDValue &Chain, unsigned StackSize) {
1536   // If the function takes variable number of arguments, make a frame index for
1537   // the start of the first vararg value... for expansion of llvm.va_start. We
1538   // can skip this if there are no va_start calls.
1539   if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
1540                     CallConv != CallingConv::X86_ThisCall)) {
1541     FuncInfo->setVarArgsFrameIndex(
1542         FrameInfo.CreateFixedObject(1, StackSize, true));
1543   }
1544 
1545   // 64-bit calling conventions support varargs and register parameters, so we
1546   // have to do extra work to spill them in the prologue.
1547   if (is64Bit()) {
1548     // Find the first unallocated argument registers.
1549     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
1550     ArrayRef<MCPhysReg> ArgXMMs =
1551         get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
1552     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
1553     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
1554 
1555     assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
1556            "SSE register cannot be used when SSE is disabled!");
1557 
1558     if (isWin64()) {
1559       // Get to the caller-allocated home save location.  Add 8 to account
1560       // for the return address.
1561       int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
1562       FuncInfo->setRegSaveFrameIndex(
1563           FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
1564       // Fixup to set vararg frame on shadow area (4 x i64).
1565       if (NumIntRegs < 4)
1566         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
1567     } else {
1568       // For X86-64, if there are vararg parameters that are passed via
1569       // registers, then we must store them to their spots on the stack so
1570       // they may be loaded by dereferencing the result of va_next.
1571       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
1572       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
1573       FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
1574           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
1575     }
1576 
1577     SmallVector<SDValue, 6>
1578         LiveGPRs; // list of SDValue for GPR registers keeping live input value
1579     SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
1580                                          // keeping live input value
1581     SDValue ALVal; // if applicable keeps SDValue for %al register
1582 
1583     // Gather all the live in physical registers.
1584     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
1585       Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
1586       LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
1587     }
1588     const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
1589     if (!AvailableXmms.empty()) {
1590       Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
1591       ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
1592       for (MCPhysReg Reg : AvailableXmms) {
1593         // FastRegisterAllocator spills virtual registers at basic
1594         // block boundary. That leads to usages of xmm registers
1595         // outside of check for %al. Pass physical registers to
1596         // VASTART_SAVE_XMM_REGS to avoid unneccessary spilling.
1597         TheMachineFunction.getRegInfo().addLiveIn(Reg);
1598         LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
1599       }
1600     }
1601 
1602     // Store the integer parameter registers.
1603     SmallVector<SDValue, 8> MemOps;
1604     SDValue RSFIN =
1605         DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
1606                           TargLowering.getPointerTy(DAG.getDataLayout()));
1607     unsigned Offset = FuncInfo->getVarArgsGPOffset();
1608     for (SDValue Val : LiveGPRs) {
1609       SDValue FIN = DAG.getNode(ISD::ADD, DL,
1610                                 TargLowering.getPointerTy(DAG.getDataLayout()),
1611                                 RSFIN, DAG.getIntPtrConstant(Offset, DL));
1612       SDValue Store =
1613           DAG.getStore(Val.getValue(1), DL, Val, FIN,
1614                        MachinePointerInfo::getFixedStack(
1615                            DAG.getMachineFunction(),
1616                            FuncInfo->getRegSaveFrameIndex(), Offset));
1617       MemOps.push_back(Store);
1618       Offset += 8;
1619     }
1620 
1621     // Now store the XMM (fp + vector) parameter registers.
1622     if (!LiveXMMRegs.empty()) {
1623       SmallVector<SDValue, 12> SaveXMMOps;
1624       SaveXMMOps.push_back(Chain);
1625       SaveXMMOps.push_back(ALVal);
1626       SaveXMMOps.push_back(RSFIN);
1627       SaveXMMOps.push_back(
1628           DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
1629       llvm::append_range(SaveXMMOps, LiveXMMRegs);
1630       MachineMemOperand *StoreMMO =
1631           DAG.getMachineFunction().getMachineMemOperand(
1632               MachinePointerInfo::getFixedStack(
1633                   DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
1634                   Offset),
1635               MachineMemOperand::MOStore, 128, Align(16));
1636       MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
1637                                                DL, DAG.getVTList(MVT::Other),
1638                                                SaveXMMOps, MVT::i8, StoreMMO));
1639     }
1640 
1641     if (!MemOps.empty())
1642       Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
1643   }
1644 }
1645 
forwardMustTailParameters(SDValue & Chain)1646 void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
1647   // Find the largest legal vector type.
1648   MVT VecVT = MVT::Other;
1649   // FIXME: Only some x86_32 calling conventions support AVX512.
1650   if (Subtarget.useAVX512Regs() &&
1651       (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
1652                      CallConv == CallingConv::Intel_OCL_BI)))
1653     VecVT = MVT::v16f32;
1654   else if (Subtarget.hasAVX())
1655     VecVT = MVT::v8f32;
1656   else if (Subtarget.hasSSE2())
1657     VecVT = MVT::v4f32;
1658 
1659   // We forward some GPRs and some vector types.
1660   SmallVector<MVT, 2> RegParmTypes;
1661   MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
1662   RegParmTypes.push_back(IntVT);
1663   if (VecVT != MVT::Other)
1664     RegParmTypes.push_back(VecVT);
1665 
1666   // Compute the set of forwarded registers. The rest are scratch.
1667   SmallVectorImpl<ForwardedRegister> &Forwards =
1668       FuncInfo->getForwardedMustTailRegParms();
1669   CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
1670 
1671   // Forward AL for SysV x86_64 targets, since it is used for varargs.
1672   if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
1673     Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
1674     Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
1675   }
1676 
1677   // Copy all forwards from physical to virtual registers.
1678   for (ForwardedRegister &FR : Forwards) {
1679     // FIXME: Can we use a less constrained schedule?
1680     SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
1681     FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
1682         TargLowering.getRegClassFor(FR.VT));
1683     Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
1684   }
1685 }
1686 
lowerVarArgsParameters(SDValue & Chain,unsigned StackSize)1687 void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
1688                                                    unsigned StackSize) {
1689   // Set FrameIndex to the 0xAAAAAAA value to mark unset state.
1690   // If necessary, it would be set into the correct value later.
1691   FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
1692   FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1693 
1694   if (FrameInfo.hasVAStart())
1695     createVarArgAreaAndStoreRegisters(Chain, StackSize);
1696 
1697   if (FrameInfo.hasMustTailInVarArgFunc())
1698     forwardMustTailParameters(Chain);
1699 }
1700 
LowerFormalArguments(SDValue Chain,CallingConv::ID CallConv,bool IsVarArg,const SmallVectorImpl<ISD::InputArg> & Ins,const SDLoc & dl,SelectionDAG & DAG,SmallVectorImpl<SDValue> & InVals) const1701 SDValue X86TargetLowering::LowerFormalArguments(
1702     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
1703     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1704     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1705   MachineFunction &MF = DAG.getMachineFunction();
1706   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1707 
1708   const Function &F = MF.getFunction();
1709   if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
1710       F.getName() == "main")
1711     FuncInfo->setForceFramePointer(true);
1712 
1713   MachineFrameInfo &MFI = MF.getFrameInfo();
1714   bool Is64Bit = Subtarget.is64Bit();
1715   bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
1716 
1717   assert(
1718       !(IsVarArg && canGuaranteeTCO(CallConv)) &&
1719       "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
1720 
1721   // Assign locations to all of the incoming arguments.
1722   SmallVector<CCValAssign, 16> ArgLocs;
1723   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
1724 
1725   // Allocate shadow area for Win64.
1726   if (IsWin64)
1727     CCInfo.AllocateStack(32, Align(8));
1728 
1729   CCInfo.AnalyzeArguments(Ins, CC_X86);
1730 
1731   // In vectorcall calling convention a second pass is required for the HVA
1732   // types.
1733   if (CallingConv::X86_VectorCall == CallConv) {
1734     CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
1735   }
1736 
1737   // The next loop assumes that the locations are in the same order of the
1738   // input arguments.
1739   assert(isSortedByValueNo(ArgLocs) &&
1740          "Argument Location list must be sorted before lowering");
1741 
1742   SDValue ArgValue;
1743   for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
1744        ++I, ++InsIndex) {
1745     assert(InsIndex < Ins.size() && "Invalid Ins index");
1746     CCValAssign &VA = ArgLocs[I];
1747 
1748     if (VA.isRegLoc()) {
1749       EVT RegVT = VA.getLocVT();
1750       if (VA.needsCustom()) {
1751         assert(
1752             VA.getValVT() == MVT::v64i1 &&
1753             "Currently the only custom case is when we split v64i1 to 2 regs");
1754 
1755         // v64i1 values, in regcall calling convention, that are
1756         // compiled to 32 bit arch, are split up into two registers.
1757         ArgValue =
1758             getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
1759       } else {
1760         const TargetRegisterClass *RC;
1761         if (RegVT == MVT::i8)
1762           RC = &X86::GR8RegClass;
1763         else if (RegVT == MVT::i16)
1764           RC = &X86::GR16RegClass;
1765         else if (RegVT == MVT::i32)
1766           RC = &X86::GR32RegClass;
1767         else if (Is64Bit && RegVT == MVT::i64)
1768           RC = &X86::GR64RegClass;
1769         else if (RegVT == MVT::f16)
1770           RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
1771         else if (RegVT == MVT::f32)
1772           RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
1773         else if (RegVT == MVT::f64)
1774           RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
1775         else if (RegVT == MVT::f80)
1776           RC = &X86::RFP80RegClass;
1777         else if (RegVT == MVT::f128)
1778           RC = &X86::VR128RegClass;
1779         else if (RegVT.is512BitVector())
1780           RC = &X86::VR512RegClass;
1781         else if (RegVT.is256BitVector())
1782           RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
1783         else if (RegVT.is128BitVector())
1784           RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
1785         else if (RegVT == MVT::x86mmx)
1786           RC = &X86::VR64RegClass;
1787         else if (RegVT == MVT::v1i1)
1788           RC = &X86::VK1RegClass;
1789         else if (RegVT == MVT::v8i1)
1790           RC = &X86::VK8RegClass;
1791         else if (RegVT == MVT::v16i1)
1792           RC = &X86::VK16RegClass;
1793         else if (RegVT == MVT::v32i1)
1794           RC = &X86::VK32RegClass;
1795         else if (RegVT == MVT::v64i1)
1796           RC = &X86::VK64RegClass;
1797         else
1798           llvm_unreachable("Unknown argument type!");
1799 
1800         Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
1801         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1802       }
1803 
1804       // If this is an 8 or 16-bit value, it is really passed promoted to 32
1805       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1806       // right size.
1807       if (VA.getLocInfo() == CCValAssign::SExt)
1808         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1809                                DAG.getValueType(VA.getValVT()));
1810       else if (VA.getLocInfo() == CCValAssign::ZExt)
1811         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1812                                DAG.getValueType(VA.getValVT()));
1813       else if (VA.getLocInfo() == CCValAssign::BCvt)
1814         ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
1815 
1816       if (VA.isExtInLoc()) {
1817         // Handle MMX values passed in XMM regs.
1818         if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
1819           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
1820         else if (VA.getValVT().isVector() &&
1821                  VA.getValVT().getScalarType() == MVT::i1 &&
1822                  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
1823                   (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
1824           // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
1825           ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
1826         } else
1827           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1828       }
1829     } else {
1830       assert(VA.isMemLoc());
1831       ArgValue =
1832           LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
1833     }
1834 
1835     // If value is passed via pointer - do a load.
1836     if (VA.getLocInfo() == CCValAssign::Indirect &&
1837         !(Ins[I].Flags.isByVal() && VA.isRegLoc())) {
1838       ArgValue =
1839           DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
1840     }
1841 
1842     InVals.push_back(ArgValue);
1843   }
1844 
1845   for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
1846     if (Ins[I].Flags.isSwiftAsync()) {
1847       auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
1848       if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF))
1849         X86FI->setHasSwiftAsyncContext(true);
1850       else {
1851         int PtrSize = Subtarget.is64Bit() ? 8 : 4;
1852         int FI =
1853             MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize), false);
1854         X86FI->setSwiftAsyncContextFrameIdx(FI);
1855         SDValue St = DAG.getStore(
1856             DAG.getEntryNode(), dl, InVals[I],
1857             DAG.getFrameIndex(FI, PtrSize == 8 ? MVT::i64 : MVT::i32),
1858             MachinePointerInfo::getFixedStack(MF, FI));
1859         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
1860       }
1861     }
1862 
1863     // Swift calling convention does not require we copy the sret argument
1864     // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
1865     if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
1866       continue;
1867 
1868     // All x86 ABIs require that for returning structs by value we copy the
1869     // sret argument into %rax/%eax (depending on ABI) for the return. Save
1870     // the argument into a virtual register so that we can access it from the
1871     // return points.
1872     if (Ins[I].Flags.isSRet()) {
1873       assert(!FuncInfo->getSRetReturnReg() &&
1874              "SRet return has already been set");
1875       MVT PtrTy = getPointerTy(DAG.getDataLayout());
1876       Register Reg =
1877           MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
1878       FuncInfo->setSRetReturnReg(Reg);
1879       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
1880       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1881       break;
1882     }
1883   }
1884 
1885   unsigned StackSize = CCInfo.getStackSize();
1886   // Align stack specially for tail calls.
1887   if (shouldGuaranteeTCO(CallConv,
1888                          MF.getTarget().Options.GuaranteedTailCallOpt))
1889     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1890 
1891   if (IsVarArg)
1892     VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
1893         .lowerVarArgsParameters(Chain, StackSize);
1894 
1895   // Some CCs need callee pop.
1896   if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
1897                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
1898     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
1899   } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
1900     // X86 interrupts must pop the error code (and the alignment padding) if
1901     // present.
1902     FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
1903   } else {
1904     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
1905     // If this is an sret function, the return should pop the hidden pointer.
1906     if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
1907       FuncInfo->setBytesToPopOnReturn(4);
1908   }
1909 
1910   if (!Is64Bit) {
1911     // RegSaveFrameIndex is X86-64 only.
1912     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1913   }
1914 
1915   FuncInfo->setArgumentStackSize(StackSize);
1916 
1917   if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
1918     EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
1919     if (Personality == EHPersonality::CoreCLR) {
1920       assert(Is64Bit);
1921       // TODO: Add a mechanism to frame lowering that will allow us to indicate
1922       // that we'd prefer this slot be allocated towards the bottom of the frame
1923       // (i.e. near the stack pointer after allocating the frame).  Every
1924       // funclet needs a copy of this slot in its (mostly empty) frame, and the
1925       // offset from the bottom of this and each funclet's frame must be the
1926       // same, so the size of funclets' (mostly empty) frames is dictated by
1927       // how far this slot is from the bottom (since they allocate just enough
1928       // space to accommodate holding this slot at the correct offset).
1929       int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
1930       EHInfo->PSPSymFrameIdx = PSPSymFI;
1931     }
1932   }
1933 
1934   if (shouldDisableArgRegFromCSR(CallConv) ||
1935       F.hasFnAttribute("no_caller_saved_registers")) {
1936     MachineRegisterInfo &MRI = MF.getRegInfo();
1937     for (std::pair<MCRegister, Register> Pair : MRI.liveins())
1938       MRI.disableCalleeSavedRegister(Pair.first);
1939   }
1940 
1941   if (CallingConv::PreserveNone == CallConv)
1942     for (const ISD::InputArg &In : Ins) {
1943       if (In.Flags.isSwiftSelf() || In.Flags.isSwiftAsync() ||
1944           In.Flags.isSwiftError()) {
1945         errorUnsupported(DAG, dl,
1946                          "Swift attributes can't be used with preserve_none");
1947         break;
1948       }
1949     }
1950 
1951   return Chain;
1952 }
1953 
LowerMemOpCallTo(SDValue Chain,SDValue StackPtr,SDValue Arg,const SDLoc & dl,SelectionDAG & DAG,const CCValAssign & VA,ISD::ArgFlagsTy Flags,bool isByVal) const1954 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
1955                                             SDValue Arg, const SDLoc &dl,
1956                                             SelectionDAG &DAG,
1957                                             const CCValAssign &VA,
1958                                             ISD::ArgFlagsTy Flags,
1959                                             bool isByVal) const {
1960   unsigned LocMemOffset = VA.getLocMemOffset();
1961   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1962   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1963                        StackPtr, PtrOff);
1964   if (isByVal)
1965     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1966 
1967   MaybeAlign Alignment;
1968   if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
1969       Arg.getSimpleValueType() != MVT::f80)
1970     Alignment = MaybeAlign(4);
1971   return DAG.getStore(
1972       Chain, dl, Arg, PtrOff,
1973       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
1974       Alignment);
1975 }
1976 
1977 /// Emit a load of return address if tail call
1978 /// optimization is performed and it is required.
EmitTailCallLoadRetAddr(SelectionDAG & DAG,SDValue & OutRetAddr,SDValue Chain,bool IsTailCall,bool Is64Bit,int FPDiff,const SDLoc & dl) const1979 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
1980     SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
1981     bool Is64Bit, int FPDiff, const SDLoc &dl) const {
1982   // Adjust the Return address stack slot.
1983   EVT VT = getPointerTy(DAG.getDataLayout());
1984   OutRetAddr = getReturnAddressFrameIndex(DAG);
1985 
1986   // Load the "old" Return address.
1987   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
1988   return SDValue(OutRetAddr.getNode(), 1);
1989 }
1990 
1991 /// Emit a store of the return address if tail call
1992 /// optimization is performed and it is required (FPDiff!=0).
EmitTailCallStoreRetAddr(SelectionDAG & DAG,MachineFunction & MF,SDValue Chain,SDValue RetAddrFrIdx,EVT PtrVT,unsigned SlotSize,int FPDiff,const SDLoc & dl)1993 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
1994                                         SDValue Chain, SDValue RetAddrFrIdx,
1995                                         EVT PtrVT, unsigned SlotSize,
1996                                         int FPDiff, const SDLoc &dl) {
1997   // Store the return address to the appropriate stack slot.
1998   if (!FPDiff) return Chain;
1999   // Calculate the new stack slot for the return address.
2000   int NewReturnAddrFI =
2001     MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2002                                          false);
2003   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2004   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2005                        MachinePointerInfo::getFixedStack(
2006                            DAG.getMachineFunction(), NewReturnAddrFI));
2007   return Chain;
2008 }
2009 
2010 /// Returns a vector_shuffle mask for an movs{s|d}, movd
2011 /// operation of specified width.
getMOVL(SelectionDAG & DAG,const SDLoc & dl,MVT VT,SDValue V1,SDValue V2) const2012 SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
2013                                    SDValue V1, SDValue V2) const {
2014   unsigned NumElems = VT.getVectorNumElements();
2015   SmallVector<int, 8> Mask;
2016   Mask.push_back(NumElems);
2017   for (unsigned i = 1; i != NumElems; ++i)
2018     Mask.push_back(i);
2019   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
2020 }
2021 
/// Lower the outgoing call described by \p CLI into X86 SelectionDAG nodes.
///
/// The arguments are assigned locations with CC_X86, copied into registers or
/// stored to the stack, and the call is emitted as X86ISD::CALL, NT_CALL,
/// IMP_CALL or CALL_RVMARKER bracketed by CALLSEQ_START/CALLSEQ_END (sibcalls
/// and musttail calls skip the bracket). Tail calls are emitted as
/// X86ISD::TC_RETURN and that node is returned directly; otherwise the value
/// returned by LowerCallResult, which receives \p InVals, is returned.
///
/// CLI.IsTailCall is an in/out flag: it is cleared here when the call cannot
/// be lowered as a tail call. A fatal error is reported for direct calls to
/// X86_INTR functions, for musttail call sites that cannot be tail-called, and
/// for invalid inalloca usage.
2022 SDValue
LowerCall(TargetLowering::CallLoweringInfo & CLI,SmallVectorImpl<SDValue> & InVals) const2023 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2024                              SmallVectorImpl<SDValue> &InVals) const {
2025   SelectionDAG &DAG                     = CLI.DAG;
2026   SDLoc &dl                             = CLI.DL;
2027   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2028   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2029   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2030   SDValue Chain                         = CLI.Chain;
2031   SDValue Callee                        = CLI.Callee;
2032   CallingConv::ID CallConv              = CLI.CallConv;
2033   bool &isTailCall                      = CLI.IsTailCall;
2034   bool isVarArg                         = CLI.IsVarArg;
2035   const auto *CB                        = CLI.CB;
2036 
2037   MachineFunction &MF = DAG.getMachineFunction();
2038   bool Is64Bit        = Subtarget.is64Bit();
2039   bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
2040   bool IsSibcall      = false;
2041   bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
2042       CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
2043   bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
2044   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2045   bool HasNCSR = (CB && isa<CallInst>(CB) &&
2046                   CB->hasFnAttr("no_caller_saved_registers"));
2047   bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
2048   bool IsCFICall = IsIndirectCall && CLI.CFIType;
2049   const Module *M = MF.getFunction().getParent();
2050 
2051   // If the indirect call target has the nocf_check attribute, the call needs
2052   // the NOTRACK prefix. For simplicity just disable tail calls as there are
2053   // so many variants.
2054   bool IsNoTrackIndirectCall = IsIndirectCall && CB->doesNoCfCheck() &&
2055                                M->getModuleFlag("cf-protection-branch");
2056   if (IsNoTrackIndirectCall)
2057     isTailCall = false;
2058 
2059   MachineFunction::CallSiteInfo CSInfo;
2060   if (CallConv == CallingConv::X86_INTR)
2061     report_fatal_error("X86 interrupts may not be called directly");
2062 
2063   if (IsIndirectCall && !IsWin64 &&
2064       M->getModuleFlag("import-call-optimization"))
2065     errorUnsupported(DAG, dl,
2066                      "Indirect calls must have a normal calling convention if "
2067                      "Import Call Optimization is enabled");
2068 
2069   // Analyze operands of the call, assigning locations to each operand.
2070   SmallVector<CCValAssign, 16> ArgLocs;
2071   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2072 
2073   // Allocate shadow area for Win64.
2074   if (IsWin64)
2075     CCInfo.AllocateStack(32, Align(8));
2076 
2077   CCInfo.AnalyzeArguments(Outs, CC_X86);
2078 
2079   // In vectorcall calling convention a second pass is required for the HVA
2080   // types.
2081   if (CallingConv::X86_VectorCall == CallConv) {
2082     CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
2083   }
2084 
2085   bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
2086   if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
2087     // If we are using a GOT, disable tail calls to external symbols with
2088     // default visibility. Tail calling such a symbol requires using a GOT
2089     // relocation, which forces early binding of the symbol. This breaks code
2090     // that require lazy function symbol resolution. Using musttail or
2091     // GuaranteedTailCallOpt will override this.
2092     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2093     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
2094                G->getGlobal()->hasDefaultVisibility()))
2095       isTailCall = false;
2096   }
2097 
2098   if (isTailCall && !IsMustTail) {
2099     // Check if it's really possible to do a tail call.
2100     isTailCall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs,
2101                                                    IsCalleePopSRet);
2102 
2103     // Sibcalls are automatically detected tailcalls which do not require
2104     // ABI changes.
2105     if (!IsGuaranteeTCO && isTailCall)
2106       IsSibcall = true;
2107 
2108     if (isTailCall)
2109       ++NumTailCalls;
2110   }
2111 
2112   if (IsMustTail && !isTailCall)
2113     report_fatal_error("failed to perform tail call elimination on a call "
2114                        "site marked musttail");
2115 
2116   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
2117          "Var args not supported with calling convention fastcc, ghc or hipe");
2118 
2119   // Get a count of how many bytes are to be pushed on the stack.
2120   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
2121   if (IsSibcall)
2122     // This is a sibcall. The memory operands are available in caller's
2123     // own caller's stack.
2124     NumBytes = 0;
2125   else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
2126     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2127 
2128   int FPDiff = 0;
2129   if (isTailCall &&
2130       shouldGuaranteeTCO(CallConv,
2131                          MF.getTarget().Options.GuaranteedTailCallOpt)) {
2132     // Lower arguments at fp - stackoffset + fpdiff.
2133     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2134 
2135     FPDiff = NumBytesCallerPushed - NumBytes;
2136 
2137     // Record the delta by which the return-address stack slot moves,
2138     // but only when it is smaller than the delta recorded so far.
2139     if (FPDiff < X86Info->getTCReturnAddrDelta())
2140       X86Info->setTCReturnAddrDelta(FPDiff);
2141   }
2142 
2143   unsigned NumBytesToPush = NumBytes;
2144   unsigned NumBytesToPop = NumBytes;
2145 
2146   // If we have an inalloca argument, all stack space has already been allocated
2147   // for us and is right at the top of the stack.  We don't support multiple
2148   // arguments passed in memory when using inalloca.
2149   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2150     NumBytesToPush = 0;
2151     if (!ArgLocs.back().isMemLoc())
2152       report_fatal_error("cannot use inalloca attribute on a register "
2153                          "parameter");
2154     if (ArgLocs.back().getLocMemOffset() != 0)
2155       report_fatal_error("any parameter with the inalloca attribute must be "
2156                          "the only memory argument");
2157   } else if (CLI.IsPreallocated) {
2158     assert(ArgLocs.back().isMemLoc() &&
2159            "cannot use preallocated attribute on a register "
2160            "parameter");
2161     SmallVector<size_t, 4> PreallocatedOffsets;
2162     for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
2163       if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
2164         PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
2165       }
2166     }
2167     auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
2168     size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
2169     MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
2170     MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
2171     NumBytesToPush = 0;
2172   }
2173 
       // Sibcalls and musttail calls do not open a call sequence.
2174   if (!IsSibcall && !IsMustTail)
2175     Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
2176                                  NumBytes - NumBytesToPush, dl);
2177 
2178   SDValue RetAddrFrIdx;
2179   // Load return address for tail calls.
2180   if (isTailCall && FPDiff)
2181     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2182                                     Is64Bit, FPDiff, dl);
2183 
2184   SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
2185   SmallVector<SDValue, 8> MemOpChains;
2186   SDValue StackPtr;
2187 
2188   // The next loop assumes that the locations are in the same order of the
2189   // input arguments.
2190   assert(isSortedByValueNo(ArgLocs) &&
2191          "Argument Location list must be sorted before lowering");
2192 
2193   // Walk the register/memloc assignments, inserting copies/loads.  In the case
2194   // of tail call optimization arguments are handled later.
2195   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2196   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
2197        ++I, ++OutIndex) {
2198     assert(OutIndex < Outs.size() && "Invalid Out index");
2199     // Skip inalloca/preallocated arguments, they have already been written.
2200     ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
2201     if (Flags.isInAlloca() || Flags.isPreallocated())
2202       continue;
2203 
2204     CCValAssign &VA = ArgLocs[I];
2205     EVT RegVT = VA.getLocVT();
2206     SDValue Arg = OutVals[OutIndex];
2207     bool isByVal = Flags.isByVal();
2208 
2209     // Promote the value if needed.
2210     switch (VA.getLocInfo()) {
2211     default: llvm_unreachable("Unknown loc info!");
2212     case CCValAssign::Full: break;
2213     case CCValAssign::SExt:
2214       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2215       break;
2216     case CCValAssign::ZExt:
2217       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2218       break;
2219     case CCValAssign::AExt:
2220       if (Arg.getValueType().isVector() &&
2221           Arg.getValueType().getVectorElementType() == MVT::i1)
2222         Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
2223       else if (RegVT.is128BitVector()) {
2224         // Special case: passing MMX values in XMM registers.
2225         Arg = DAG.getBitcast(MVT::i64, Arg);
2226         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2227         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2228       } else
2229         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2230       break;
2231     case CCValAssign::BCvt:
2232       Arg = DAG.getBitcast(RegVT, Arg);
2233       break;
2234     case CCValAssign::Indirect: {
2235       if (isByVal) {
2236         // Memcpy the argument to a temporary stack slot to prevent
2237         // the caller from seeing any modifications the callee may make
2238         // as guaranteed by the `byval` attribute.
2239         int FrameIdx = MF.getFrameInfo().CreateStackObject(
2240             Flags.getByValSize(),
2241             std::max(Align(16), Flags.getNonZeroByValAlign()), false);
2242         SDValue StackSlot =
2243             DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
2244         Chain =
2245             CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
2246         // From now on treat this as a regular pointer
2247         Arg = StackSlot;
2248         isByVal = false;
2249       } else {
2250         // Store the argument.
2251         SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2252         int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2253         Chain = DAG.getStore(
2254             Chain, dl, Arg, SpillSlot,
2255             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2256         Arg = SpillSlot;
2257       }
2258       break;
2259     }
2260     }
2261 
2262     if (VA.needsCustom()) {
2263       assert(VA.getValVT() == MVT::v64i1 &&
2264              "Currently the only custom case is when we split v64i1 to 2 regs");
2265       // Split v64i1 value into two registers
2266       Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
2267     } else if (VA.isRegLoc()) {
2268       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2269       const TargetOptions &Options = DAG.getTarget().Options;
2270       if (Options.EmitCallSiteInfo)
2271         CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), I);
2272       if (isVarArg && IsWin64) {
2273         // Win64 ABI requires argument XMM reg to be copied to the corresponding
2274         // shadow reg if callee is a varargs function.
2275         Register ShadowReg;
2276         switch (VA.getLocReg()) {
2277         case X86::XMM0: ShadowReg = X86::RCX; break;
2278         case X86::XMM1: ShadowReg = X86::RDX; break;
2279         case X86::XMM2: ShadowReg = X86::R8; break;
2280         case X86::XMM3: ShadowReg = X86::R9; break;
2281         }
2282         if (ShadowReg)
2283           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2284       }
2285     } else if (!IsSibcall && (!isTailCall || isByVal)) {
           // Memory arguments are stored here for ordinary calls. For
           // non-sibcall tail calls only byval arguments are written here; all
           // their memory arguments are placed in their final slots by the
           // tail-call loop further down. Sibcalls store nothing.
2286       assert(VA.isMemLoc());
2287       if (!StackPtr.getNode())
2288         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2289                                       getPointerTy(DAG.getDataLayout()));
2290       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2291                                              dl, DAG, VA, Flags, isByVal));
2292     }
2293   }
2294 
2295   if (!MemOpChains.empty())
2296     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2297 
2298   if (Subtarget.isPICStyleGOT()) {
2299     // ELF / PIC requires GOT in the EBX register before function calls via PLT
2300     // GOT pointer (except regcall).
2301     if (!isTailCall) {
2302       // Indirect call with RegCall calling convention may use up all the
2303       // general registers, so it is not suitable to bind EBX register for
2304       // GOT address, just let register allocator handle it.
2305       if (CallConv != CallingConv::X86_RegCall)
2306         RegsToPass.push_back(std::make_pair(
2307           Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2308                                           getPointerTy(DAG.getDataLayout()))));
2309     } else {
2310       // If we are tail calling and generating PIC/GOT style code load the
2311       // address of the callee into ECX. The value in ecx is used as target of
2312       // the tail jump. This is done to circumvent the ebx/callee-saved problem
2313       // for tail calls on PIC/GOT architectures. Normally we would just put the
2314       // address of GOT into ebx and then call target@PLT. But for tail calls
2315       // ebx would be restored (since ebx is callee saved) before jumping to the
2316       // target@PLT.
2317 
2318       // Note: The actual moving to ECX is done further down.
2319       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2320       if (G && !G->getGlobal()->hasLocalLinkage() &&
2321           G->getGlobal()->hasDefaultVisibility())
2322         Callee = LowerGlobalAddress(Callee, DAG);
2323       else if (isa<ExternalSymbolSDNode>(Callee))
2324         Callee = LowerExternalSymbol(Callee, DAG);
2325     }
2326   }
2327 
2328   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
2329       (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
2330     // From AMD64 ABI document:
2331     // For calls that may call functions that use varargs or stdargs
2332     // (prototype-less calls or calls to functions containing ellipsis (...) in
2333     // the declaration) %al is used as hidden argument to specify the number
2334     // of SSE registers used. The contents of %al do not need to match exactly
2335     // the number of registers, but must be an upper bound on the number of SSE
2336     // registers used and is in the range 0 - 8 inclusive.
2337 
2338     // Count the number of XMM registers allocated.
2339     static const MCPhysReg XMMArgRegs[] = {
2340       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2341       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2342     };
2343     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
2344     assert((Subtarget.hasSSE1() || !NumXMMRegs)
2345            && "SSE registers cannot be used when SSE is disabled");
2346     RegsToPass.push_back(std::make_pair(Register(X86::AL),
2347                                         DAG.getConstant(NumXMMRegs, dl,
2348                                                         MVT::i8)));
2349   }
2350 
       // A variadic musttail call passes on the register parameters recorded
       // in X86Info as forwarded musttail register parameters.
2351   if (isVarArg && IsMustTail) {
2352     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
2353     for (const auto &F : Forwards) {
2354       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2355       RegsToPass.push_back(std::make_pair(F.PReg, Val));
2356     }
2357   }
2358 
2359   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
2360   // don't need this because the eligibility check rejects calls that require
2361   // shuffling arguments passed in memory.
2362   if (!IsSibcall && isTailCall) {
2363     // Force all the incoming stack arguments to be loaded from the stack
2364     // before any new outgoing arguments or the return address are stored to the
2365     // stack, because the outgoing stack slots may alias the incoming argument
2366     // stack slots, and the alias isn't otherwise explicit. This is slightly
2367     // more conservative than necessary, because it means that each store
2368     // effectively depends on every argument instead of just those arguments it
2369     // would clobber.
2370     Chain = DAG.getStackArgumentTokenFactor(Chain);
2371 
2372     SmallVector<SDValue, 8> MemOpChains2;
2373     SDValue FIN;
2374     int FI = 0;
2375     for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
2376          ++I, ++OutsIndex) {
2377       CCValAssign &VA = ArgLocs[I];
2378 
2379       if (VA.isRegLoc()) {
2380         if (VA.needsCustom()) {
2381           assert((CallConv == CallingConv::X86_RegCall) &&
2382                  "Expecting custom case only in regcall calling convention");
2383           // This means that we are in special case where one argument was
2384           // passed through two register locations - Skip the next location
2385           ++I;
2386         }
2387 
2388         continue;
2389       }
2390 
2391       assert(VA.isMemLoc());
2392       SDValue Arg = OutVals[OutsIndex];
2393       ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
2394       // Skip inalloca/preallocated arguments.  They don't require any work.
2395       if (Flags.isInAlloca() || Flags.isPreallocated())
2396         continue;
2397       // Create frame index.
2398       int32_t Offset = VA.getLocMemOffset()+FPDiff;
2399       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2400       FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
2401       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2402 
2403       if (Flags.isByVal()) {
2404         // Copy relative to framepointer.
2405         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
2406         if (!StackPtr.getNode())
2407           StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2408                                         getPointerTy(DAG.getDataLayout()));
2409         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2410                              StackPtr, Source);
2411 
2412         MemOpChains2.push_back(
2413             CreateCopyOfByValArgument(Source, FIN, Chain, Flags, DAG, dl));
2414       } else {
2415         // Store relative to framepointer.
2416         MemOpChains2.push_back(DAG.getStore(
2417             Chain, dl, Arg, FIN,
2418             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
2419       }
2420     }
2421 
2422     if (!MemOpChains2.empty())
2423       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
2424 
2425     // Store the return address to the appropriate stack slot.
2426     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
2427                                      getPointerTy(DAG.getDataLayout()),
2428                                      RegInfo->getSlotSize(), FPDiff, dl);
2429   }
2430 
2431   // Build a sequence of copy-to-reg nodes chained together with token chain
2432   // and glue operands which copy the outgoing args into registers.
2433   SDValue InGlue;
2434   for (const auto &[Reg, N] : RegsToPass) {
2435     Chain = DAG.getCopyToReg(Chain, dl, Reg, N, InGlue);
2436     InGlue = Chain.getValue(1);
2437   }
2438 
2439   bool IsImpCall = false;
2440   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
2441     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2442     // In the 64-bit large code model, we have to make all calls
2443     // through a register, since the call instruction's 32-bit
2444     // pc-relative offset may not be large enough to hold the whole
2445     // address.
2446   } else if (Callee->getOpcode() == ISD::GlobalAddress ||
2447              Callee->getOpcode() == ISD::ExternalSymbol) {
2448     // Lower direct calls to global addresses and external symbols. Setting
2449     // ForCall to true here has the effect of removing WrapperRIP when possible
2450     // to allow direct calls to be selected without first materializing the
2451     // address into a register.
2452     Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true, &IsImpCall);
2453   } else if (Subtarget.isTarget64BitILP32() &&
2454              Callee.getValueType() == MVT::i32) {
2455     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
2456     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
2457   }
2458 
2459   SmallVector<SDValue, 8> Ops;
2460 
       // A non-sibcall, non-musttail tail call opened a call sequence above;
       // close it here because the tail-call path below returns before the
       // later CALLSEQ_END is reached.
2461   if (!IsSibcall && isTailCall && !IsMustTail) {
2462     Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
2463     InGlue = Chain.getValue(1);
2464   }
2465 
2466   Ops.push_back(Chain);
2467   Ops.push_back(Callee);
2468 
       // Tail calls carry FPDiff as an extra i32 target-constant operand.
2469   if (isTailCall)
2470     Ops.push_back(DAG.getSignedTargetConstant(FPDiff, dl, MVT::i32));
2471 
2472   // Add argument registers to the end of the list so that they are known live
2473   // into the call.
2474   for (const auto &[Reg, N] : RegsToPass)
2475     Ops.push_back(DAG.getRegister(Reg, N.getValueType()));
2476 
2477   // Add a register mask operand representing the call-preserved registers.
2478   const uint32_t *Mask = [&]() {
2479     auto AdaptedCC = CallConv;
2480     // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
2481     // use X86_INTR calling convention because it has the same CSR mask
2482     // (same preserved registers).
2483     if (HasNCSR)
2484       AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
2485     // If NoCalleeSavedRegisters is requested, then use GHC since it happens
2486     // to use the CSR_NoRegs_RegMask.
2487     if (CB && CB->hasFnAttr("no_callee_saved_registers"))
2488       AdaptedCC = (CallingConv::ID)CallingConv::GHC;
2489     return RegInfo->getCallPreservedMask(MF, AdaptedCC);
2490   }();
2491   assert(Mask && "Missing call preserved mask for calling convention");
2492 
       // Record in X86Info when the call's mask clobbers the frame pointer or
       // base register, and separately when the call site is an invoke.
2493   if (MachineOperand::clobbersPhysReg(Mask, RegInfo->getFramePtr())) {
2494     X86Info->setFPClobberedByCall(true);
2495     if (CLI.CB && isa<InvokeInst>(CLI.CB))
2496       X86Info->setFPClobberedByInvoke(true);
2497   }
2498   if (MachineOperand::clobbersPhysReg(Mask, RegInfo->getBaseRegister())) {
2499     X86Info->setBPClobberedByCall(true);
2500     if (CLI.CB && isa<InvokeInst>(CLI.CB))
2501       X86Info->setBPClobberedByInvoke(true);
2502   }
2503 
2504   // If this is an invoke in a 32-bit function using a funclet-based
2505   // personality, assume the function clobbers all registers. If an exception
2506   // is thrown, the runtime will not restore CSRs.
2507   // FIXME: Model this more precisely so that we can register allocate across
2508   // the normal edge and spill and fill across the exceptional edge.
2509   if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
2510     const Function &CallerFn = MF.getFunction();
2511     EHPersonality Pers =
2512         CallerFn.hasPersonalityFn()
2513             ? classifyEHPersonality(CallerFn.getPersonalityFn())
2514             : EHPersonality::Unknown;
2515     if (isFuncletEHPersonality(Pers))
2516       Mask = RegInfo->getNoPreservedMask();
2517   }
2518 
2519   // Define a new register mask from the existing mask.
2520   uint32_t *RegMask = nullptr;
2521 
2522   // In some calling conventions we need to remove the used physical registers
2523   // from the reg mask. Create a new RegMask for such calling conventions.
2524   // RegMask for calling conventions that disable only return registers (e.g.
2525   // preserve_most) will be modified later in LowerCallResult.
2526   bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
2527   if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
2528     const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2529 
2530     // Allocate a new Reg Mask and copy Mask.
2531     RegMask = MF.allocateRegMask();
2532     unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
2533     memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
2534 
2535     // Make sure all sub registers of the argument registers are reset
2536     // in the RegMask.
2537     if (ShouldDisableArgRegs) {
2538       for (auto const &RegPair : RegsToPass)
2539         for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first))
2540           RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
2541     }
2542 
2543     // Create the RegMask Operand according to our updated mask.
2544     Ops.push_back(DAG.getRegisterMask(RegMask));
2545   } else {
2546     // Create the RegMask Operand according to the static mask.
2547     Ops.push_back(DAG.getRegisterMask(Mask));
2548   }
2549 
2550   if (InGlue.getNode())
2551     Ops.push_back(InGlue);
2552 
2553   if (isTailCall) {
2554     // We used to do:
2555     //// If this is the first return lowered for this function, add the regs
2556     //// to the liveout set for the function.
2557     // This isn't right, although it's probably harmless on x86; liveouts
2558     // should be computed from returns not tail calls.  Consider a void
2559     // function making a tail call to a function returning int.
2560     MF.getFrameInfo().setHasTailCall();
2561     SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, MVT::Other, Ops);
2562 
2563     if (IsCFICall)
2564       Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2565 
2566     DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2567     DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2568     return Ret;
2569   }
2570 
2571   // Returns a chain & a glue for retval copy to use.
2572   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2573   if (IsImpCall) {
2574     Chain = DAG.getNode(X86ISD::IMP_CALL, dl, NodeTys, Ops);
2575   } else if (IsNoTrackIndirectCall) {
2576     Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
2577   } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
2578     // Calls with a "clang.arc.attachedcall" bundle are special. They should be
2579     // expanded to the call, directly followed by a special marker sequence and
2580     // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
2581     assert(!isTailCall &&
2582            "tail calls cannot be marked with clang.arc.attachedcall");
2583     assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
2584 
2585     // Add a target global address for the retainRV/claimRV runtime function
2586     // just before the call target.
2587     Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
2588     auto PtrVT = getPointerTy(DAG.getDataLayout());
2589     auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
2590     Ops.insert(Ops.begin() + 1, GA);
2591     Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
2592   } else {
2593     Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
2594   }
2595 
2596   if (IsCFICall)
2597     Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2598 
2599   InGlue = Chain.getValue(1);
2600   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2601   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2602 
2603   // Save heapallocsite metadata.
2604   if (CLI.CB)
2605     if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
2606       DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
2607 
2608   // Create the CALLSEQ_END node.
2609   unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
2610   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2611                        DAG.getTarget().Options.GuaranteedTailCallOpt))
2612     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
2613   else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
2614     // If this call passes a struct-return pointer, the callee
2615     // pops that struct pointer.
2616     NumBytesForCalleeToPop = 4;
2617 
2618   // Returns a glue for retval copy to use.
2619   if (!IsSibcall) {
2620     Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
2621                                InGlue, dl);
2622     InGlue = Chain.getValue(1);
2623   }
2624 
       // Diagnose (non-fatally, once per call) Swift-specific argument
       // attributes on a preserve_none call.
2625   if (CallingConv::PreserveNone == CallConv)
2626     for (const ISD::OutputArg &Out : Outs) {
2627       if (Out.Flags.isSwiftSelf() || Out.Flags.isSwiftAsync() ||
2628           Out.Flags.isSwiftError()) {
2629         errorUnsupported(DAG, dl,
2630                          "Swift attributes can't be used with preserve_none");
2631         break;
2632       }
2633     }
2634 
2635   // Handle result values, copying them out of physregs into vregs that we
2636   // return.
2637   return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2638                          InVals, RegMask);
2639 }
2640 
2641 //===----------------------------------------------------------------------===//
2642 //                Fast Calling Convention (tail call) implementation
2643 //===----------------------------------------------------------------------===//
2644 
2645 //  Like stdcall, the callee cleans up the arguments, except that ECX is
2646 //  reserved for storing the tail called function address. Only 2 registers are
2647 //  free for argument passing (inreg). Tail call optimization is performed
2648 //  provided:
2649 //                * tailcallopt is enabled
2650 //                * caller/callee are fastcc
2651 //  On X86_64 architecture with GOT-style position independent code only local
2652 //  (within module) calls are supported at the moment.
2653 //  To keep the stack aligned according to platform abi the function
2654 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
2655 //  of stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
2656 //  If a tail-called callee takes more arguments than the caller, the
2657 //  caller needs to make sure that there is room to move the RETADDR to. This is
2658 //  achieved by reserving an area the size of the argument delta right after the
2659 //  original RETADDR, but before the saved framepointer or the spilled registers
2660 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
2661 //  stack layout:
2662 //    arg1
2663 //    arg2
2664 //    RETADDR
2665 //    [ new RETADDR
2666 //      move area ]
2667 //    (possible EBP)
2668 //    ESI
2669 //    EDI
2670 //    local1 ..
2671 
2672 /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
2673 /// requirement.
2674 unsigned
GetAlignedArgumentStackSize(const unsigned StackSize,SelectionDAG & DAG) const2675 X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
2676                                                SelectionDAG &DAG) const {
2677   const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
2678   const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
2679   assert(StackSize % SlotSize == 0 &&
2680          "StackSize must be a multiple of SlotSize");
2681   return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
2682 }
2683 
2684 /// Return true if the given stack call argument is already available in the
2685 /// same position (relatively) of the caller's incoming argument stack.
2686 static
MatchingStackOffset(SDValue Arg,unsigned Offset,ISD::ArgFlagsTy Flags,MachineFrameInfo & MFI,const MachineRegisterInfo * MRI,const X86InstrInfo * TII,const CCValAssign & VA)2687 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2688                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2689                          const X86InstrInfo *TII, const CCValAssign &VA) {
2690   unsigned Bytes = Arg.getValueSizeInBits() / 8;
2691 
2692   for (;;) {
2693     // Look through nodes that don't alter the bits of the incoming value.
2694     unsigned Op = Arg.getOpcode();
2695     if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
2696         Op == ISD::AssertZext) {
2697       Arg = Arg.getOperand(0);
2698       continue;
2699     }
2700     if (Op == ISD::TRUNCATE) {
2701       const SDValue &TruncInput = Arg.getOperand(0);
2702       if (TruncInput.getOpcode() == ISD::AssertZext &&
2703           cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
2704               Arg.getValueType()) {
2705         Arg = TruncInput.getOperand(0);
2706         continue;
2707       }
2708     }
2709     break;
2710   }
2711 
2712   int FI = INT_MAX;
2713   if (Arg.getOpcode() == ISD::CopyFromReg) {
2714     Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2715     if (!VR.isVirtual())
2716       return false;
2717     MachineInstr *Def = MRI->getVRegDef(VR);
2718     if (!Def)
2719       return false;
2720     if (!Flags.isByVal()) {
2721       if (!TII->isLoadFromStackSlot(*Def, FI))
2722         return false;
2723     } else {
2724       unsigned Opcode = Def->getOpcode();
2725       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
2726            Opcode == X86::LEA64_32r) &&
2727           Def->getOperand(1).isFI()) {
2728         FI = Def->getOperand(1).getIndex();
2729         Bytes = Flags.getByValSize();
2730       } else
2731         return false;
2732     }
2733   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2734     if (Flags.isByVal())
2735       // ByVal argument is passed in as a pointer but it's now being
2736       // dereferenced. e.g.
2737       // define @foo(%struct.X* %A) {
2738       //   tail call @bar(%struct.X* byval %A)
2739       // }
2740       return false;
2741     SDValue Ptr = Ld->getBasePtr();
2742     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2743     if (!FINode)
2744       return false;
2745     FI = FINode->getIndex();
2746   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
2747     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
2748     FI = FINode->getIndex();
2749     Bytes = Flags.getByValSize();
2750   } else
2751     return false;
2752 
2753   assert(FI != INT_MAX);
2754   if (!MFI.isFixedObjectIndex(FI))
2755     return false;
2756 
2757   if (Offset != MFI.getObjectOffset(FI))
2758     return false;
2759 
2760   // If this is not byval, check that the argument stack object is immutable.
2761   // inalloca and argument copy elision can create mutable argument stack
2762   // objects. Byval objects can be mutated, but a byval call intends to pass the
2763   // mutated memory.
2764   if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
2765     return false;
2766 
2767   if (VA.getLocVT().getFixedSizeInBits() >
2768       Arg.getValueSizeInBits().getFixedValue()) {
2769     // If the argument location is wider than the argument type, check that any
2770     // extension flags match.
2771     if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
2772         Flags.isSExt() != MFI.isObjectSExt(FI)) {
2773       return false;
2774     }
2775   }
2776 
2777   return Bytes == MFI.getObjectSize(FI);
2778 }
2779 
2780 /// Check whether the call is eligible for tail call optimization. Targets
2781 /// that want to do tail call optimization should implement this function.
2782 /// Note that the x86 backend does not check musttail calls for eligibility! The
2783 /// rest of x86 tail call lowering must be prepared to forward arguments of any
2784 /// type.
IsEligibleForTailCallOptimization(TargetLowering::CallLoweringInfo & CLI,CCState & CCInfo,SmallVectorImpl<CCValAssign> & ArgLocs,bool IsCalleePopSRet) const2785 bool X86TargetLowering::IsEligibleForTailCallOptimization(
2786     TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
2787     SmallVectorImpl<CCValAssign> &ArgLocs, bool IsCalleePopSRet) const {
2788   SelectionDAG &DAG = CLI.DAG;
2789   const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2790   const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2791   const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2792   SDValue Callee = CLI.Callee;
2793   CallingConv::ID CalleeCC = CLI.CallConv;
2794   bool isVarArg = CLI.IsVarArg;
2795 
2796   if (!mayTailCallThisCC(CalleeCC))
2797     return false;
2798 
2799   // If -tailcallopt is specified, make fastcc functions tail-callable.
2800   MachineFunction &MF = DAG.getMachineFunction();
2801   const Function &CallerF = MF.getFunction();
2802 
2803   // If the function return type is x86_fp80 and the callee return type is not,
2804   // then the FP_EXTEND of the call result is not a nop. It's not safe to
2805   // perform a tailcall optimization here.
2806   if (CallerF.getReturnType()->isX86_FP80Ty() && !CLI.RetTy->isX86_FP80Ty())
2807     return false;
2808 
2809   CallingConv::ID CallerCC = CallerF.getCallingConv();
2810   bool CCMatch = CallerCC == CalleeCC;
2811   bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
2812   bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
2813   bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
2814       CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
2815 
2816   // Win64 functions have extra shadow space for argument homing. Don't do the
2817   // sibcall if the caller and callee have mismatched expectations for this
2818   // space.
2819   if (IsCalleeWin64 != IsCallerWin64)
2820     return false;
2821 
2822   if (IsGuaranteeTCO) {
2823     if (canGuaranteeTCO(CalleeCC) && CCMatch)
2824       return true;
2825     return false;
2826   }
2827 
2828   // Look for obvious safe cases to perform tail call optimization that do not
2829   // require ABI changes. This is what gcc calls sibcall.
2830 
2831   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
2832   // emit a special epilogue.
2833   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2834   if (RegInfo->hasStackRealignment(MF))
2835     return false;
2836 
2837   // Also avoid sibcall optimization if we're an sret return fn and the callee
2838   // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
2839   // insufficient.
2840   if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
2841     // For a compatible tail call the callee must return our sret pointer. So it
2842     // needs to be (a) an sret function itself and (b) we pass our sret as its
2843     // sret. Condition #b is harder to determine.
2844     return false;
2845   } else if (IsCalleePopSRet)
2846     // The callee pops an sret, so we cannot tail-call, as our caller doesn't
2847     // expect that.
2848     return false;
2849 
2850   // Do not sibcall optimize vararg calls unless all arguments are passed via
2851   // registers.
2852   LLVMContext &C = *DAG.getContext();
2853   if (isVarArg && !Outs.empty()) {
2854     // Optimizing for varargs on Win64 is unlikely to be safe without
2855     // additional testing.
2856     if (IsCalleeWin64 || IsCallerWin64)
2857       return false;
2858 
2859     for (const auto &VA : ArgLocs)
2860       if (!VA.isRegLoc())
2861         return false;
2862   }
2863 
2864   // If the call result is in ST0 / ST1, it needs to be popped off the x87
2865   // stack.  Therefore, if it's not used by the call it is not safe to optimize
2866   // this into a sibcall.
2867   bool Unused = false;
2868   for (const auto &In : Ins) {
2869     if (!In.Used) {
2870       Unused = true;
2871       break;
2872     }
2873   }
2874   if (Unused) {
2875     SmallVector<CCValAssign, 16> RVLocs;
2876     CCState RVCCInfo(CalleeCC, false, MF, RVLocs, C);
2877     RVCCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2878     for (const auto &VA : RVLocs) {
2879       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
2880         return false;
2881     }
2882   }
2883 
2884   // Check that the call results are passed in the same way.
2885   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
2886                                   RetCC_X86, RetCC_X86))
2887     return false;
2888   // The callee has to preserve all registers the caller needs to preserve.
2889   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2890   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2891   if (!CCMatch) {
2892     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2893     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2894       return false;
2895   }
2896 
2897   // The stack frame of the caller cannot be replaced by the tail-callee one's
2898   // if the function is required to preserve all the registers. Conservatively
2899   // prevent tail optimization even if hypothetically all the registers are used
2900   // for passing formal parameters or returning values.
2901   if (CallerF.hasFnAttribute("no_caller_saved_registers"))
2902     return false;
2903 
2904   unsigned StackArgsSize = CCInfo.getStackSize();
2905 
2906   // If the callee takes no arguments then go on to check the results of the
2907   // call.
2908   if (!Outs.empty()) {
2909     if (StackArgsSize > 0) {
2910       // Check if the arguments are already laid out in the right way as
2911       // the caller's fixed stack objects.
2912       MachineFrameInfo &MFI = MF.getFrameInfo();
2913       const MachineRegisterInfo *MRI = &MF.getRegInfo();
2914       const X86InstrInfo *TII = Subtarget.getInstrInfo();
2915       for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
2916         const CCValAssign &VA = ArgLocs[I];
2917         SDValue Arg = OutVals[I];
2918         ISD::ArgFlagsTy Flags = Outs[I].Flags;
2919         if (VA.getLocInfo() == CCValAssign::Indirect)
2920           return false;
2921         if (!VA.isRegLoc()) {
2922           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI,
2923                                    TII, VA))
2924             return false;
2925         }
2926       }
2927     }
2928 
2929     bool PositionIndependent = isPositionIndependent();
2930     // If the tailcall address may be in a register, then make sure it's
2931     // possible to register allocate for it. In 32-bit, the call address can
2932     // only target EAX, EDX, or ECX since the tail call must be scheduled after
2933     // callee-saved registers are restored. These happen to be the same
2934     // registers used to pass 'inreg' arguments so watch out for those.
2935     if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
2936                                   !isa<ExternalSymbolSDNode>(Callee)) ||
2937                                  PositionIndependent)) {
2938       unsigned NumInRegs = 0;
2939       // In PIC we need an extra register to formulate the address computation
2940       // for the callee.
2941       unsigned MaxInRegs = PositionIndependent ? 2 : 3;
2942 
2943       for (const auto &VA : ArgLocs) {
2944         if (!VA.isRegLoc())
2945           continue;
2946         Register Reg = VA.getLocReg();
2947         switch (Reg) {
2948         default: break;
2949         case X86::EAX: case X86::EDX: case X86::ECX:
2950           if (++NumInRegs == MaxInRegs)
2951             return false;
2952           break;
2953         }
2954       }
2955     }
2956 
2957     const MachineRegisterInfo &MRI = MF.getRegInfo();
2958     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
2959       return false;
2960   }
2961 
2962   bool CalleeWillPop =
2963       X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
2964                        MF.getTarget().Options.GuaranteedTailCallOpt);
2965 
2966   if (unsigned BytesToPop =
2967           MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
2968     // If we have bytes to pop, the callee must pop them.
2969     bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
2970     if (!CalleePopMatches)
2971       return false;
2972   } else if (CalleeWillPop && StackArgsSize > 0) {
2973     // If we don't have bytes to pop, make sure the callee doesn't pop any.
2974     return false;
2975   }
2976 
2977   return true;
2978 }
2979 
2980 /// Determines whether the callee is required to pop its own arguments.
2981 /// Callee pop is necessary to support tail calls.
isCalleePop(CallingConv::ID CallingConv,bool is64Bit,bool IsVarArg,bool GuaranteeTCO)2982 bool X86::isCalleePop(CallingConv::ID CallingConv,
2983                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
2984   // If GuaranteeTCO is true, we force some calls to be callee pop so that we
2985   // can guarantee TCO.
2986   if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
2987     return true;
2988 
2989   switch (CallingConv) {
2990   default:
2991     return false;
2992   case CallingConv::X86_StdCall:
2993   case CallingConv::X86_FastCall:
2994   case CallingConv::X86_ThisCall:
2995   case CallingConv::X86_VectorCall:
2996     return !is64Bit;
2997   }
2998 }
2999