xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLoweringCall.cpp (revision 0e8011faf58b743cc652e3b2ad0f7671227610df)
1 //===- llvm/lib/Target/X86/X86ISelLoweringCall.cpp - Call lowering --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file implements the lowering of LLVM calls to DAG nodes.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "X86.h"
15 #include "X86CallingConv.h"
16 #include "X86FrameLowering.h"
17 #include "X86ISelLowering.h"
18 #include "X86InstrBuilder.h"
19 #include "X86MachineFunctionInfo.h"
20 #include "X86TargetMachine.h"
21 #include "X86TargetObjectFile.h"
22 #include "llvm/ADT/Statistic.h"
23 #include "llvm/Analysis/ObjCARCUtil.h"
24 #include "llvm/CodeGen/MachineJumpTableInfo.h"
25 #include "llvm/CodeGen/MachineModuleInfo.h"
26 #include "llvm/CodeGen/WinEHFuncInfo.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IRBuilder.h"
29 #include "llvm/IR/Module.h"
30 
31 #define DEBUG_TYPE "x86-isel"
32 
33 using namespace llvm;
34 
35 STATISTIC(NumTailCalls, "Number of tail calls");
36 
37 /// Call this when the user attempts to do something unsupported, like
38 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
39 /// report_fatal_error, so calling code should attempt to recover without
40 /// crashing.
41 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
42                              const char *Msg) {
43   MachineFunction &MF = DAG.getMachineFunction();
44   DAG.getContext()->diagnose(
45       DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
46 }
47 
48 /// Returns true if a CC can dynamically exclude a register from the list of
49 /// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
50 /// the return registers.
51 static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
52   switch (CC) {
53   default:
54     return false;
55   case CallingConv::X86_RegCall:
56   case CallingConv::PreserveMost:
57   case CallingConv::PreserveAll:
58     return true;
59   }
60 }
61 
62 /// Returns true if a CC can dynamically exclude a register from the list of
63 /// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
64 /// the parameters.
65 static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
66   return CC == CallingConv::X86_RegCall;
67 }
68 
69 static std::pair<MVT, unsigned>
70 handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
71                                  const X86Subtarget &Subtarget) {
72   // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
73   // convention is one that uses k registers.
74   if (NumElts == 2)
75     return {MVT::v2i64, 1};
76   if (NumElts == 4)
77     return {MVT::v4i32, 1};
78   if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
79       CC != CallingConv::Intel_OCL_BI)
80     return {MVT::v8i16, 1};
81   if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
82       CC != CallingConv::Intel_OCL_BI)
83     return {MVT::v16i8, 1};
84   // v32i1 passes in ymm unless we have BWI and the calling convention is
85   // regcall.
86   if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
87     return {MVT::v32i8, 1};
88   // Split v64i1 vectors if we don't have v64i8 available.
89   if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
90     if (Subtarget.useAVX512Regs())
91       return {MVT::v64i8, 1};
92     return {MVT::v32i8, 2};
93   }
94 
95   // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
96   if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
97       NumElts > 64)
98     return {MVT::i8, NumElts};
99 
100   return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
101 }
102 
103 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
104                                                      CallingConv::ID CC,
105                                                      EVT VT) const {
106   if (VT.isVector()) {
107     if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
108       unsigned NumElts = VT.getVectorNumElements();
109 
110       MVT RegisterVT;
111       unsigned NumRegisters;
112       std::tie(RegisterVT, NumRegisters) =
113           handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
114       if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
115         return RegisterVT;
116     }
117 
118     if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
119       return MVT::v8f16;
120   }
121 
122   // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
123   if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
124       !Subtarget.hasX87())
125     return MVT::i32;
126 
127   if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
128     return getRegisterTypeForCallingConv(Context, CC,
129                                          VT.changeVectorElementType(MVT::f16));
130 
131   if (VT == MVT::bf16)
132     return MVT::f16;
133 
134   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
135 }
136 
137 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
138                                                           CallingConv::ID CC,
139                                                           EVT VT) const {
140   if (VT.isVector()) {
141     if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
142       unsigned NumElts = VT.getVectorNumElements();
143 
144       MVT RegisterVT;
145       unsigned NumRegisters;
146       std::tie(RegisterVT, NumRegisters) =
147           handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
148       if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
149         return NumRegisters;
150     }
151 
152     if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
153       return 1;
154   }
155 
156   // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
157   // x87 is disabled.
158   if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
159     if (VT == MVT::f64)
160       return 2;
161     if (VT == MVT::f80)
162       return 3;
163   }
164 
165   if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
166     return getNumRegistersForCallingConv(Context, CC,
167                                          VT.changeVectorElementType(MVT::f16));
168 
169   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
170 }
171 
172 unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
173     LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
174     unsigned &NumIntermediates, MVT &RegisterVT) const {
175   // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
176   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
177       Subtarget.hasAVX512() &&
178       (!isPowerOf2_32(VT.getVectorNumElements()) ||
179        (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
180        VT.getVectorNumElements() > 64)) {
181     RegisterVT = MVT::i8;
182     IntermediateVT = MVT::i1;
183     NumIntermediates = VT.getVectorNumElements();
184     return NumIntermediates;
185   }
186 
187   // Split v64i1 vectors if we don't have v64i8 available.
188   if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
189       CC != CallingConv::X86_RegCall) {
190     RegisterVT = MVT::v32i8;
191     IntermediateVT = MVT::v32i1;
192     NumIntermediates = 2;
193     return 2;
194   }
195 
196   // Split vNbf16 vectors according to vNf16.
197   if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
198     VT = VT.changeVectorElementType(MVT::f16);
199 
200   return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
201                                               NumIntermediates, RegisterVT);
202 }
203 
204 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
205                                           LLVMContext& Context,
206                                           EVT VT) const {
207   if (!VT.isVector())
208     return MVT::i8;
209 
210   if (Subtarget.hasAVX512()) {
211     // Figure out what this type will be legalized to.
212     EVT LegalVT = VT;
213     while (getTypeAction(Context, LegalVT) != TypeLegal)
214       LegalVT = getTypeToTransformTo(Context, LegalVT);
215 
216     // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
217     if (LegalVT.getSimpleVT().is512BitVector())
218       return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
219 
220     if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
221       // If we legalized to less than a 512-bit vector, then we will use a vXi1
222       // compare for vXi32/vXi64 for sure. If we have BWI we will also support
223       // vXi16/vXi8.
224       MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
225       if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
226         return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
227     }
228   }
229 
230   return VT.changeVectorElementTypeToInteger();
231 }
232 
233 /// Helper for getByValTypeAlignment to determine
234 /// the desired ByVal argument alignment.
235 static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
236   if (MaxAlign == 16)
237     return;
238   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
239     if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
240       MaxAlign = Align(16);
241   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
242     Align EltAlign;
243     getMaxByValAlign(ATy->getElementType(), EltAlign);
244     if (EltAlign > MaxAlign)
245       MaxAlign = EltAlign;
246   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
247     for (auto *EltTy : STy->elements()) {
248       Align EltAlign;
249       getMaxByValAlign(EltTy, EltAlign);
250       if (EltAlign > MaxAlign)
251         MaxAlign = EltAlign;
252       if (MaxAlign == 16)
253         break;
254     }
255   }
256 }
257 
258 /// Return the desired alignment for ByVal aggregate
259 /// function arguments in the caller parameter area. For X86, aggregates
260 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
261 /// are at 4-byte boundaries.
262 uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
263                                                   const DataLayout &DL) const {
264   if (Subtarget.is64Bit()) {
265     // Max of 8 and alignment of type.
266     Align TyAlign = DL.getABITypeAlign(Ty);
267     if (TyAlign > 8)
268       return TyAlign.value();
269     return 8;
270   }
271 
272   Align Alignment(4);
273   if (Subtarget.hasSSE1())
274     getMaxByValAlign(Ty, Alignment);
275   return Alignment.value();
276 }
277 
278 /// It returns EVT::Other if the type should be determined using generic
279 /// target-independent logic.
280 /// For vector ops we check that the overall size isn't larger than our
281 /// preferred vector width.
282 EVT X86TargetLowering::getOptimalMemOpType(
283     const MemOp &Op, const AttributeList &FuncAttributes) const {
284   if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
285     if (Op.size() >= 16 &&
286         (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
287       // FIXME: Check if unaligned 64-byte accesses are slow.
288       if (Op.size() >= 64 && Subtarget.hasAVX512() && Subtarget.hasEVEX512() &&
289           (Subtarget.getPreferVectorWidth() >= 512)) {
290         return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
291       }
292       // FIXME: Check if unaligned 32-byte accesses are slow.
293       if (Op.size() >= 32 && Subtarget.hasAVX() &&
294           Subtarget.useLight256BitInstructions()) {
295         // Although this isn't a well-supported type for AVX1, we'll let
296         // legalization and shuffle lowering produce the optimal codegen. If we
297         // choose an optimal type with a vector element larger than a byte,
298         // getMemsetStores() may create an intermediate splat (using an integer
299         // multiply) before we splat as a vector.
300         return MVT::v32i8;
301       }
302       if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
303         return MVT::v16i8;
304       // TODO: Can SSE1 handle a byte vector?
305       // If we have SSE1 registers we should be able to use them.
306       if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
307           (Subtarget.getPreferVectorWidth() >= 128))
308         return MVT::v4f32;
309     } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
310                Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
311       // Do not use f64 to lower memcpy if source is string constant. It's
312       // better to use i32 to avoid the loads.
313       // Also, do not use f64 to lower memset unless this is a memset of zeros.
314       // The gymnastics of splatting a byte value into an XMM register and then
315       // only using 8-byte stores (because this is a CPU with slow unaligned
316       // 16-byte accesses) makes that a loser.
317       return MVT::f64;
318     }
319   }
320   // This is a compromise. If we reach here, unaligned accesses may be slow on
321   // this target. However, creating smaller, aligned accesses could be even
322   // slower and would certainly be a lot more code.
323   if (Subtarget.is64Bit() && Op.size() >= 8)
324     return MVT::i64;
325   return MVT::i32;
326 }
327 
328 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
329   if (VT == MVT::f32)
330     return Subtarget.hasSSE1();
331   if (VT == MVT::f64)
332     return Subtarget.hasSSE2();
333   return true;
334 }
335 
336 static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
337   return (8 * Alignment.value()) % SizeInBits == 0;
338 }
339 
340 bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
341   if (isBitAligned(Alignment, VT.getSizeInBits()))
342     return true;
343   switch (VT.getSizeInBits()) {
344   default:
345     // 8-byte and under are always assumed to be fast.
346     return true;
347   case 128:
348     return !Subtarget.isUnalignedMem16Slow();
349   case 256:
350     return !Subtarget.isUnalignedMem32Slow();
351     // TODO: What about AVX-512 (512-bit) accesses?
352   }
353 }
354 
355 bool X86TargetLowering::allowsMisalignedMemoryAccesses(
356     EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
357     unsigned *Fast) const {
358   if (Fast)
359     *Fast = isMemoryAccessFast(VT, Alignment);
360   // NonTemporal vector memory ops must be aligned.
361   if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
362     // NT loads can only be vector aligned, so if its less aligned than the
363     // minimum vector size (which we can split the vector down to), we might as
364     // well use a regular unaligned vector load.
365     // We don't have any NT loads pre-SSE41.
366     if (!!(Flags & MachineMemOperand::MOLoad))
367       return (Alignment < 16 || !Subtarget.hasSSE41());
368     return false;
369   }
370   // Misaligned accesses of any size are always allowed.
371   return true;
372 }
373 
374 bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
375                                            const DataLayout &DL, EVT VT,
376                                            unsigned AddrSpace, Align Alignment,
377                                            MachineMemOperand::Flags Flags,
378                                            unsigned *Fast) const {
379   if (Fast)
380     *Fast = isMemoryAccessFast(VT, Alignment);
381   if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
382     if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
383                                        /*Fast=*/nullptr))
384       return true;
385     // NonTemporal vector memory ops are special, and must be aligned.
386     if (!isBitAligned(Alignment, VT.getSizeInBits()))
387       return false;
388     switch (VT.getSizeInBits()) {
389     case 128:
390       if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
391         return true;
392       if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
393         return true;
394       return false;
395     case 256:
396       if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
397         return true;
398       if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
399         return true;
400       return false;
401     case 512:
402       if (Subtarget.hasAVX512() && Subtarget.hasEVEX512())
403         return true;
404       return false;
405     default:
406       return false; // Don't have NonTemporal vector memory ops of this size.
407     }
408   }
409   return true;
410 }
411 
412 /// Return the entry encoding for a jump table in the
413 /// current function.  The returned value is a member of the
414 /// MachineJumpTableInfo::JTEntryKind enum.
415 unsigned X86TargetLowering::getJumpTableEncoding() const {
416   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
417   // symbol.
418   if (isPositionIndependent() && Subtarget.isPICStyleGOT())
419     return MachineJumpTableInfo::EK_Custom32;
420   if (isPositionIndependent() &&
421       getTargetMachine().getCodeModel() == CodeModel::Large &&
422       !Subtarget.isTargetCOFF())
423     return MachineJumpTableInfo::EK_LabelDifference64;
424 
425   // Otherwise, use the normal jump table encoding heuristics.
426   return TargetLowering::getJumpTableEncoding();
427 }
428 
429 bool X86TargetLowering::useSoftFloat() const {
430   return Subtarget.useSoftFloat();
431 }
432 
433 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
434                                               ArgListTy &Args) const {
435 
436   // Only relabel X86-32 for C / Stdcall CCs.
437   if (Subtarget.is64Bit())
438     return;
439   if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
440     return;
441   unsigned ParamRegs = 0;
442   if (auto *M = MF->getFunction().getParent())
443     ParamRegs = M->getNumberRegisterParameters();
444 
445   // Mark the first N int arguments as having reg
446   for (auto &Arg : Args) {
447     Type *T = Arg.Ty;
448     if (T->isIntOrPtrTy())
449       if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
450         unsigned numRegs = 1;
451         if (MF->getDataLayout().getTypeAllocSize(T) > 4)
452           numRegs = 2;
453         if (ParamRegs < numRegs)
454           return;
455         ParamRegs -= numRegs;
456         Arg.IsInReg = true;
457       }
458   }
459 }
460 
461 const MCExpr *
462 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
463                                              const MachineBasicBlock *MBB,
464                                              unsigned uid,MCContext &Ctx) const{
465   assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
466   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
467   // entries.
468   return MCSymbolRefExpr::create(MBB->getSymbol(),
469                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
470 }
471 
472 /// Returns relocation base for the given PIC jumptable.
473 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
474                                                     SelectionDAG &DAG) const {
475   if (!Subtarget.is64Bit())
476     // This doesn't have SDLoc associated with it, but is not really the
477     // same as a Register.
478     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
479                        getPointerTy(DAG.getDataLayout()));
480   return Table;
481 }
482 
483 /// This returns the relocation base for the given PIC jumptable,
484 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
485 const MCExpr *X86TargetLowering::
486 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
487                              MCContext &Ctx) const {
488   // X86-64 uses RIP relative addressing based on the jump table label.
489   if (Subtarget.isPICStyleRIPRel() ||
490       (Subtarget.is64Bit() &&
491        getTargetMachine().getCodeModel() == CodeModel::Large))
492     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
493 
494   // Otherwise, the reference is relative to the PIC base.
495   return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
496 }
497 
498 std::pair<const TargetRegisterClass *, uint8_t>
499 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
500                                            MVT VT) const {
501   const TargetRegisterClass *RRC = nullptr;
502   uint8_t Cost = 1;
503   switch (VT.SimpleTy) {
504   default:
505     return TargetLowering::findRepresentativeClass(TRI, VT);
506   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
507     RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
508     break;
509   case MVT::x86mmx:
510     RRC = &X86::VR64RegClass;
511     break;
512   case MVT::f32: case MVT::f64:
513   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
514   case MVT::v4f32: case MVT::v2f64:
515   case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
516   case MVT::v8f32: case MVT::v4f64:
517   case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
518   case MVT::v16f32: case MVT::v8f64:
519     RRC = &X86::VR128XRegClass;
520     break;
521   }
522   return std::make_pair(RRC, Cost);
523 }
524 
525 unsigned X86TargetLowering::getAddressSpace() const {
526   if (Subtarget.is64Bit())
527     return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
528   return 256;
529 }
530 
531 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
532   return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
533          (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
534 }
535 
536 static Constant* SegmentOffset(IRBuilderBase &IRB,
537                                int Offset, unsigned AddressSpace) {
538   return ConstantExpr::getIntToPtr(
539       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
540       IRB.getPtrTy(AddressSpace));
541 }
542 
543 Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
544   // glibc, bionic, and Fuchsia have a special slot for the stack guard in
545   // tcbhead_t; use it instead of the usual global variable (see
546   // sysdeps/{i386,x86_64}/nptl/tls.h)
547   if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
548     unsigned AddressSpace = getAddressSpace();
549 
550     // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
551     if (Subtarget.isTargetFuchsia())
552       return SegmentOffset(IRB, 0x10, AddressSpace);
553 
554     Module *M = IRB.GetInsertBlock()->getParent()->getParent();
555     // Specially, some users may customize the base reg and offset.
556     int Offset = M->getStackProtectorGuardOffset();
557     // If we don't set -stack-protector-guard-offset value:
558     // %fs:0x28, unless we're using a Kernel code model, in which case
559     // it's %gs:0x28.  gs:0x14 on i386.
560     if (Offset == INT_MAX)
561       Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
562 
563     StringRef GuardReg = M->getStackProtectorGuardReg();
564     if (GuardReg == "fs")
565       AddressSpace = X86AS::FS;
566     else if (GuardReg == "gs")
567       AddressSpace = X86AS::GS;
568 
569     // Use symbol guard if user specify.
570     StringRef GuardSymb = M->getStackProtectorGuardSymbol();
571     if (!GuardSymb.empty()) {
572       GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
573       if (!GV) {
574         Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
575                                        : Type::getInt32Ty(M->getContext());
576         GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
577                                 nullptr, GuardSymb, nullptr,
578                                 GlobalValue::NotThreadLocal, AddressSpace);
579         if (!Subtarget.isTargetDarwin())
580           GV->setDSOLocal(M->getDirectAccessExternalData());
581       }
582       return GV;
583     }
584 
585     return SegmentOffset(IRB, Offset, AddressSpace);
586   }
587   return TargetLowering::getIRStackGuard(IRB);
588 }
589 
590 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
591   // MSVC CRT provides functionalities for stack protection.
592   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
593       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
594     // MSVC CRT has a global variable holding security cookie.
595     M.getOrInsertGlobal("__security_cookie",
596                         PointerType::getUnqual(M.getContext()));
597 
598     // MSVC CRT has a function to validate security cookie.
599     FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
600         "__security_check_cookie", Type::getVoidTy(M.getContext()),
601         PointerType::getUnqual(M.getContext()));
602     if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
603       F->setCallingConv(CallingConv::X86_FastCall);
604       F->addParamAttr(0, Attribute::AttrKind::InReg);
605     }
606     return;
607   }
608 
609   StringRef GuardMode = M.getStackProtectorGuard();
610 
611   // glibc, bionic, and Fuchsia have a special slot for the stack guard.
612   if ((GuardMode == "tls" || GuardMode.empty()) &&
613       hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
614     return;
615   TargetLowering::insertSSPDeclarations(M);
616 }
617 
618 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
619   // MSVC CRT has a global variable holding security cookie.
620   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
621       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
622     return M.getGlobalVariable("__security_cookie");
623   }
624   return TargetLowering::getSDagStackGuard(M);
625 }
626 
627 Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
628   // MSVC CRT has a function to validate security cookie.
629   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
630       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
631     return M.getFunction("__security_check_cookie");
632   }
633   return TargetLowering::getSSPStackGuardCheck(M);
634 }
635 
636 Value *
637 X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
638   // Android provides a fixed TLS slot for the SafeStack pointer. See the
639   // definition of TLS_SLOT_SAFESTACK in
640   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
641   if (Subtarget.isTargetAndroid()) {
642     // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
643     // %gs:0x24 on i386
644     int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
645     return SegmentOffset(IRB, Offset, getAddressSpace());
646   }
647 
648   // Fuchsia is similar.
649   if (Subtarget.isTargetFuchsia()) {
650     // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
651     return SegmentOffset(IRB, 0x18, getAddressSpace());
652   }
653 
654   return TargetLowering::getSafeStackPointerLocation(IRB);
655 }
656 
657 //===----------------------------------------------------------------------===//
658 //               Return Value Calling Convention Implementation
659 //===----------------------------------------------------------------------===//
660 
661 bool X86TargetLowering::CanLowerReturn(
662     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
663     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
664   SmallVector<CCValAssign, 16> RVLocs;
665   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
666   return CCInfo.CheckReturn(Outs, RetCC_X86);
667 }
668 
669 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
670   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
671   return ScratchRegs;
672 }
673 
674 ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
675   static const MCPhysReg RCRegs[] = {X86::FPCW, X86::MXCSR};
676   return RCRegs;
677 }
678 
679 /// Lowers masks values (v*i1) to the local register values
680 /// \returns DAG node after lowering to register type
681 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
682                                const SDLoc &DL, SelectionDAG &DAG) {
683   EVT ValVT = ValArg.getValueType();
684 
685   if (ValVT == MVT::v1i1)
686     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ValLoc, ValArg,
687                        DAG.getIntPtrConstant(0, DL));
688 
689   if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
690       (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
691     // Two stage lowering might be required
692     // bitcast:   v8i1 -> i8 / v16i1 -> i16
693     // anyextend: i8   -> i32 / i16   -> i32
694     EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
695     SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
696     if (ValLoc == MVT::i32)
697       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValToCopy);
698     return ValToCopy;
699   }
700 
701   if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
702       (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
703     // One stage lowering is required
704     // bitcast:   v32i1 -> i32 / v64i1 -> i64
705     return DAG.getBitcast(ValLoc, ValArg);
706   }
707 
708   return DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValArg);
709 }
710 
711 /// Breaks v64i1 value into two registers and adds the new node to the DAG
712 static void Passv64i1ArgInRegs(
713     const SDLoc &DL, SelectionDAG &DAG, SDValue &Arg,
714     SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
715     CCValAssign &NextVA, const X86Subtarget &Subtarget) {
716   assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
717   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
718   assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
719   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
720          "The value should reside in two registers");
721 
722   // Before splitting the value we cast it to i64
723   Arg = DAG.getBitcast(MVT::i64, Arg);
724 
725   // Splitting the value into two i32 types
726   SDValue Lo, Hi;
727   std::tie(Lo, Hi) = DAG.SplitScalar(Arg, DL, MVT::i32, MVT::i32);
728 
729   // Attach the two i32 types into corresponding registers
730   RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
731   RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
732 }
733 
734 SDValue
735 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
736                                bool isVarArg,
737                                const SmallVectorImpl<ISD::OutputArg> &Outs,
738                                const SmallVectorImpl<SDValue> &OutVals,
739                                const SDLoc &dl, SelectionDAG &DAG) const {
740   MachineFunction &MF = DAG.getMachineFunction();
741   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
742 
743   // In some cases we need to disable registers from the default CSR list.
744   // For example, when they are used as return registers (preserve_* and X86's
745   // regcall) or for argument passing (X86's regcall).
746   bool ShouldDisableCalleeSavedRegister =
747       shouldDisableRetRegFromCSR(CallConv) ||
748       MF.getFunction().hasFnAttribute("no_caller_saved_registers");
749 
750   if (CallConv == CallingConv::X86_INTR && !Outs.empty())
751     report_fatal_error("X86 interrupts may not return any value");
752 
753   SmallVector<CCValAssign, 16> RVLocs;
754   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
755   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
756 
757   SmallVector<std::pair<Register, SDValue>, 4> RetVals;
758   for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
759        ++I, ++OutsIndex) {
760     CCValAssign &VA = RVLocs[I];
761     assert(VA.isRegLoc() && "Can only return in registers!");
762 
763     // Add the register to the CalleeSaveDisableRegs list.
764     if (ShouldDisableCalleeSavedRegister)
765       MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
766 
767     SDValue ValToCopy = OutVals[OutsIndex];
768     EVT ValVT = ValToCopy.getValueType();
769 
770     // Promote values to the appropriate types.
771     if (VA.getLocInfo() == CCValAssign::SExt)
772       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
773     else if (VA.getLocInfo() == CCValAssign::ZExt)
774       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
775     else if (VA.getLocInfo() == CCValAssign::AExt) {
776       if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
777         ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
778       else
779         ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
780     }
781     else if (VA.getLocInfo() == CCValAssign::BCvt)
782       ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
783 
784     assert(VA.getLocInfo() != CCValAssign::FPExt &&
785            "Unexpected FP-extend for return value.");
786 
787     // Report an error if we have attempted to return a value via an XMM
788     // register and SSE was disabled.
789     if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
790       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
791       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
792     } else if (!Subtarget.hasSSE2() &&
793                X86::FR64XRegClass.contains(VA.getLocReg()) &&
794                ValVT == MVT::f64) {
795       // When returning a double via an XMM register, report an error if SSE2 is
796       // not enabled.
797       errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
798       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
799     }
800 
801     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
802     // the RET instruction and handled by the FP Stackifier.
803     if (VA.getLocReg() == X86::FP0 ||
804         VA.getLocReg() == X86::FP1) {
805       // If this is a copy from an xmm register to ST(0), use an FPExtend to
806       // change the value to the FP stack register class.
807       if (isScalarFPTypeInSSEReg(VA.getValVT()))
808         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
809       RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
810       // Don't emit a copytoreg.
811       continue;
812     }
813 
814     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
815     // which is returned in RAX / RDX.
816     if (Subtarget.is64Bit()) {
817       if (ValVT == MVT::x86mmx) {
818         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
819           ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
820           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
821                                   ValToCopy);
822           // If we don't have SSE2 available, convert to v4f32 so the generated
823           // register is legal.
824           if (!Subtarget.hasSSE2())
825             ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
826         }
827       }
828     }
829 
830     if (VA.needsCustom()) {
831       assert(VA.getValVT() == MVT::v64i1 &&
832              "Currently the only custom case is when we split v64i1 to 2 regs");
833 
834       Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
835                          Subtarget);
836 
837       // Add the second register to the CalleeSaveDisableRegs list.
838       if (ShouldDisableCalleeSavedRegister)
839         MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
840     } else {
841       RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
842     }
843   }
844 
845   SDValue Glue;
846   SmallVector<SDValue, 6> RetOps;
847   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
848   // Operand #1 = Bytes To Pop
849   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
850                    MVT::i32));
851 
852   // Copy the result values into the output registers.
853   for (auto &RetVal : RetVals) {
854     if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
855       RetOps.push_back(RetVal.second);
856       continue; // Don't emit a copytoreg.
857     }
858 
859     Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
860     Glue = Chain.getValue(1);
861     RetOps.push_back(
862         DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
863   }
864 
865   // Swift calling convention does not require we copy the sret argument
866   // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
867 
868   // All x86 ABIs require that for returning structs by value we copy
869   // the sret argument into %rax/%eax (depending on ABI) for the return.
870   // We saved the argument into a virtual register in the entry block,
871   // so now we copy the value out and into %rax/%eax.
872   //
873   // Checking Function.hasStructRetAttr() here is insufficient because the IR
874   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
875   // false, then an sret argument may be implicitly inserted in the SelDAG. In
876   // either case FuncInfo->setSRetReturnReg() will have been called.
877   if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
878     // When we have both sret and another return value, we should use the
879     // original Chain stored in RetOps[0], instead of the current Chain updated
880     // in the above loop. If we only have sret, RetOps[0] equals to Chain.
881 
882     // For the case of sret and another return value, we have
883     //   Chain_0 at the function entry
884     //   Chain_1 = getCopyToReg(Chain_0) in the above loop
885     // If we use Chain_1 in getCopyFromReg, we will have
886     //   Val = getCopyFromReg(Chain_1)
887     //   Chain_2 = getCopyToReg(Chain_1, Val) from below
888 
889     // getCopyToReg(Chain_0) will be glued together with
890     // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
891     // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
892     //   Data dependency from Unit B to Unit A due to usage of Val in
893     //     getCopyToReg(Chain_1, Val)
894     //   Chain dependency from Unit A to Unit B
895 
896     // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
897     SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
898                                      getPointerTy(MF.getDataLayout()));
899 
900     Register RetValReg
901         = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
902           X86::RAX : X86::EAX;
903     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
904     Glue = Chain.getValue(1);
905 
906     // RAX/EAX now acts like a return value.
907     RetOps.push_back(
908         DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
909 
910     // Add the returned register to the CalleeSaveDisableRegs list. Don't do
911     // this however for preserve_most/preserve_all to minimize the number of
912     // callee-saved registers for these CCs.
913     if (ShouldDisableCalleeSavedRegister &&
914         CallConv != CallingConv::PreserveAll &&
915         CallConv != CallingConv::PreserveMost)
916       MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
917   }
918 
919   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
920   const MCPhysReg *I =
921       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
922   if (I) {
923     for (; *I; ++I) {
924       if (X86::GR64RegClass.contains(*I))
925         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
926       else
927         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
928     }
929   }
930 
931   RetOps[0] = Chain;  // Update chain.
932 
933   // Add the glue if we have it.
934   if (Glue.getNode())
935     RetOps.push_back(Glue);
936 
937   X86ISD::NodeType opcode = X86ISD::RET_GLUE;
938   if (CallConv == CallingConv::X86_INTR)
939     opcode = X86ISD::IRET;
940   return DAG.getNode(opcode, dl, MVT::Other, RetOps);
941 }
942 
943 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
944   if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
945     return false;
946 
947   SDValue TCChain = Chain;
948   SDNode *Copy = *N->use_begin();
949   if (Copy->getOpcode() == ISD::CopyToReg) {
950     // If the copy has a glue operand, we conservatively assume it isn't safe to
951     // perform a tail call.
952     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
953       return false;
954     TCChain = Copy->getOperand(0);
955   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
956     return false;
957 
958   bool HasRet = false;
959   for (const SDNode *U : Copy->uses()) {
960     if (U->getOpcode() != X86ISD::RET_GLUE)
961       return false;
962     // If we are returning more than one value, we can definitely
963     // not make a tail call see PR19530
964     if (U->getNumOperands() > 4)
965       return false;
966     if (U->getNumOperands() == 4 &&
967         U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
968       return false;
969     HasRet = true;
970   }
971 
972   if (!HasRet)
973     return false;
974 
975   Chain = TCChain;
976   return true;
977 }
978 
979 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
980                                            ISD::NodeType ExtendKind) const {
981   MVT ReturnMVT = MVT::i32;
982 
983   bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
984   if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
985     // The ABI does not require i1, i8 or i16 to be extended.
986     //
987     // On Darwin, there is code in the wild relying on Clang's old behaviour of
988     // always extending i8/i16 return values, so keep doing that for now.
989     // (PR26665).
990     ReturnMVT = MVT::i8;
991   }
992 
993   EVT MinVT = getRegisterType(Context, ReturnMVT);
994   return VT.bitsLT(MinVT) ? MinVT : VT;
995 }
996 
997 /// Reads two 32 bit registers and creates a 64 bit mask value.
998 /// \param VA The current 32 bit value that need to be assigned.
999 /// \param NextVA The next 32 bit value that need to be assigned.
1000 /// \param Root The parent DAG node.
1001 /// \param [in,out] InGlue Represents SDvalue in the parent DAG node for
1002 ///                        glue purposes. In the case the DAG is already using
1003 ///                        physical register instead of virtual, we should glue
1004 ///                        our new SDValue to InGlue SDvalue.
1005 /// \return a new SDvalue of size 64bit.
1006 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
1007                                 SDValue &Root, SelectionDAG &DAG,
1008                                 const SDLoc &DL, const X86Subtarget &Subtarget,
1009                                 SDValue *InGlue = nullptr) {
1010   assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
1011   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
1012   assert(VA.getValVT() == MVT::v64i1 &&
1013          "Expecting first location of 64 bit width type");
1014   assert(NextVA.getValVT() == VA.getValVT() &&
1015          "The locations should have the same type");
1016   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
1017          "The values should reside in two registers");
1018 
1019   SDValue Lo, Hi;
1020   SDValue ArgValueLo, ArgValueHi;
1021 
1022   MachineFunction &MF = DAG.getMachineFunction();
1023   const TargetRegisterClass *RC = &X86::GR32RegClass;
1024 
1025   // Read a 32 bit value from the registers.
1026   if (nullptr == InGlue) {
1027     // When no physical register is present,
1028     // create an intermediate virtual register.
1029     Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
1030     ArgValueLo = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
1031     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
1032     ArgValueHi = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
1033   } else {
1034     // When a physical register is available read the value from it and glue
1035     // the reads together.
1036     ArgValueLo =
1037       DAG.getCopyFromReg(Root, DL, VA.getLocReg(), MVT::i32, *InGlue);
1038     *InGlue = ArgValueLo.getValue(2);
1039     ArgValueHi =
1040       DAG.getCopyFromReg(Root, DL, NextVA.getLocReg(), MVT::i32, *InGlue);
1041     *InGlue = ArgValueHi.getValue(2);
1042   }
1043 
1044   // Convert the i32 type into v32i1 type.
1045   Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
1046 
1047   // Convert the i32 type into v32i1 type.
1048   Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
1049 
1050   // Concatenate the two values together.
1051   return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v64i1, Lo, Hi);
1052 }
1053 
1054 /// The function will lower a register of various sizes (8/16/32/64)
1055 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
1056 /// \returns a DAG node contains the operand after lowering to mask type.
1057 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
1058                                const EVT &ValLoc, const SDLoc &DL,
1059                                SelectionDAG &DAG) {
1060   SDValue ValReturned = ValArg;
1061 
1062   if (ValVT == MVT::v1i1)
1063     return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, ValReturned);
1064 
1065   if (ValVT == MVT::v64i1) {
1066     // In 32 bit machine, this case is handled by getv64i1Argument
1067     assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
1068     // In 64 bit machine, There is no need to truncate the value only bitcast
1069   } else {
1070     MVT MaskLenVT;
1071     switch (ValVT.getSimpleVT().SimpleTy) {
1072     case MVT::v8i1:
1073       MaskLenVT = MVT::i8;
1074       break;
1075     case MVT::v16i1:
1076       MaskLenVT = MVT::i16;
1077       break;
1078     case MVT::v32i1:
1079       MaskLenVT = MVT::i32;
1080       break;
1081     default:
1082       llvm_unreachable("Expecting a vector of i1 types");
1083     }
1084 
1085     ValReturned = DAG.getNode(ISD::TRUNCATE, DL, MaskLenVT, ValReturned);
1086   }
1087   return DAG.getBitcast(ValVT, ValReturned);
1088 }
1089 
1090 /// Lower the result values of a call into the
1091 /// appropriate copies out of appropriate physical registers.
1092 ///
1093 SDValue X86TargetLowering::LowerCallResult(
1094     SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
1095     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1096     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
1097     uint32_t *RegMask) const {
1098 
1099   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
1100   // Assign locations to each value returned by this call.
1101   SmallVector<CCValAssign, 16> RVLocs;
1102   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1103                  *DAG.getContext());
1104   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1105 
1106   // Copy all of the result registers out of their specified physreg.
1107   for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
1108        ++I, ++InsIndex) {
1109     CCValAssign &VA = RVLocs[I];
1110     EVT CopyVT = VA.getLocVT();
1111 
1112     // In some calling conventions we need to remove the used registers
1113     // from the register mask.
1114     if (RegMask) {
1115       for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg()))
1116         RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
1117     }
1118 
1119     // Report an error if there was an attempt to return FP values via XMM
1120     // registers.
1121     if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
1122       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
1123       if (VA.getLocReg() == X86::XMM1)
1124         VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
1125       else
1126         VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
1127     } else if (!Subtarget.hasSSE2() &&
1128                X86::FR64XRegClass.contains(VA.getLocReg()) &&
1129                CopyVT == MVT::f64) {
1130       errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
1131       if (VA.getLocReg() == X86::XMM1)
1132         VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
1133       else
1134         VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
1135     }
1136 
1137     // If we prefer to use the value in xmm registers, copy it out as f80 and
1138     // use a truncate to move it from fp stack reg to xmm reg.
1139     bool RoundAfterCopy = false;
1140     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
1141         isScalarFPTypeInSSEReg(VA.getValVT())) {
1142       if (!Subtarget.hasX87())
1143         report_fatal_error("X87 register return with X87 disabled");
1144       CopyVT = MVT::f80;
1145       RoundAfterCopy = (CopyVT != VA.getLocVT());
1146     }
1147 
1148     SDValue Val;
1149     if (VA.needsCustom()) {
1150       assert(VA.getValVT() == MVT::v64i1 &&
1151              "Currently the only custom case is when we split v64i1 to 2 regs");
1152       Val =
1153           getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
1154     } else {
1155       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
1156                   .getValue(1);
1157       Val = Chain.getValue(0);
1158       InGlue = Chain.getValue(2);
1159     }
1160 
1161     if (RoundAfterCopy)
1162       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1163                         // This truncation won't change the value.
1164                         DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
1165 
1166     if (VA.isExtInLoc()) {
1167       if (VA.getValVT().isVector() &&
1168           VA.getValVT().getScalarType() == MVT::i1 &&
1169           ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
1170            (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
1171         // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
1172         Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
1173       } else
1174         Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
1175     }
1176 
1177     if (VA.getLocInfo() == CCValAssign::BCvt)
1178       Val = DAG.getBitcast(VA.getValVT(), Val);
1179 
1180     InVals.push_back(Val);
1181   }
1182 
1183   return Chain;
1184 }
1185 
//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  The StdCall calling convention is used by many Windows API routines. It
//  differs only slightly from the C calling convention: the callee, not the
//  caller, cleans up the stack. Symbols are also decorated in a particular
//  way. It does not support any vector arguments.
//  For information on the fast calling convention, see the Fast Calling
//  Convention (tail call) implementation in LowerX86_32FastCCCallTo.
1195 
1196 /// Determines whether Args, either a set of outgoing arguments to a call, or a
1197 /// set of incoming args of a call, contains an sret pointer that the callee
1198 /// pops
1199 template <typename T>
1200 static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
1201                              const X86Subtarget &Subtarget) {
1202   // Not C++20 (yet), so no concepts available.
1203   static_assert(std::is_same_v<T, ISD::OutputArg> ||
1204                     std::is_same_v<T, ISD::InputArg>,
1205                 "requires ISD::OutputArg or ISD::InputArg");
1206 
1207   // Only 32-bit pops the sret.  It's a 64-bit world these days, so early-out
1208   // for most compilations.
1209   if (!Subtarget.is32Bit())
1210     return false;
1211 
1212   if (Args.empty())
1213     return false;
1214 
1215   // Most calls do not have an sret argument, check the arg next.
1216   const ISD::ArgFlagsTy &Flags = Args[0].Flags;
1217   if (!Flags.isSRet() || Flags.isInReg())
1218     return false;
1219 
1220   // The MSVCabi does not pop the sret.
1221   if (Subtarget.getTargetTriple().isOSMSVCRT())
1222     return false;
1223 
1224   // MCUs don't pop the sret
1225   if (Subtarget.isTargetMCU())
1226     return false;
1227 
1228   // Callee pops argument
1229   return true;
1230 }
1231 
1232 /// Make a copy of an aggregate at address specified by "Src" to address
1233 /// "Dst" with size and alignment information specified by the specific
1234 /// parameter attribute. The copy will be passed as a byval function parameter.
1235 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
1236                                          SDValue Chain, ISD::ArgFlagsTy Flags,
1237                                          SelectionDAG &DAG, const SDLoc &dl) {
1238   SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
1239 
1240   return DAG.getMemcpy(
1241       Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
1242       /*isVolatile*/ false, /*AlwaysInline=*/true,
1243       /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), MachinePointerInfo());
1244 }
1245 
1246 /// Return true if the calling convention is one that we can guarantee TCO for.
1247 static bool canGuaranteeTCO(CallingConv::ID CC) {
1248   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
1249           CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
1250           CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
1251 }
1252 
1253 /// Return true if we might ever do TCO for calls with this calling convention.
1254 static bool mayTailCallThisCC(CallingConv::ID CC) {
1255   switch (CC) {
1256   // C calling conventions:
1257   case CallingConv::C:
1258   case CallingConv::Win64:
1259   case CallingConv::X86_64_SysV:
1260   case CallingConv::PreserveNone:
1261   // Callee pop conventions:
1262   case CallingConv::X86_ThisCall:
1263   case CallingConv::X86_StdCall:
1264   case CallingConv::X86_VectorCall:
1265   case CallingConv::X86_FastCall:
1266   // Swift:
1267   case CallingConv::Swift:
1268     return true;
1269   default:
1270     return canGuaranteeTCO(CC);
1271   }
1272 }
1273 
1274 /// Return true if the function is being made into a tailcall target by
1275 /// changing its ABI.
1276 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
1277   return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
1278          CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
1279 }
1280 
1281 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
1282   if (!CI->isTailCall())
1283     return false;
1284 
1285   CallingConv::ID CalleeCC = CI->getCallingConv();
1286   if (!mayTailCallThisCC(CalleeCC))
1287     return false;
1288 
1289   return true;
1290 }
1291 
1292 SDValue
1293 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
1294                                     const SmallVectorImpl<ISD::InputArg> &Ins,
1295                                     const SDLoc &dl, SelectionDAG &DAG,
1296                                     const CCValAssign &VA,
1297                                     MachineFrameInfo &MFI, unsigned i) const {
1298   // Create the nodes corresponding to a load from this parameter slot.
1299   ISD::ArgFlagsTy Flags = Ins[i].Flags;
1300   bool AlwaysUseMutable = shouldGuaranteeTCO(
1301       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
1302   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1303   EVT ValVT;
1304   MVT PtrVT = getPointerTy(DAG.getDataLayout());
1305 
1306   // If value is passed by pointer we have address passed instead of the value
1307   // itself. No need to extend if the mask value and location share the same
1308   // absolute size.
1309   bool ExtendedInMem =
1310       VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
1311       VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
1312 
1313   if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
1314     ValVT = VA.getLocVT();
1315   else
1316     ValVT = VA.getValVT();
1317 
1318   // FIXME: For now, all byval parameter objects are marked mutable. This can be
1319   // changed with more analysis.
1320   // In case of tail call optimization mark all arguments mutable. Since they
1321   // could be overwritten by lowering of arguments in case of a tail call.
1322   if (Flags.isByVal()) {
1323     unsigned Bytes = Flags.getByValSize();
1324     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
1325 
1326     // FIXME: For now, all byval parameter objects are marked as aliasing. This
1327     // can be improved with deeper analysis.
1328     int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
1329                                    /*isAliased=*/true);
1330     return DAG.getFrameIndex(FI, PtrVT);
1331   }
1332 
1333   EVT ArgVT = Ins[i].ArgVT;
1334 
1335   // If this is a vector that has been split into multiple parts, don't elide
1336   // the copy. The layout on the stack may not match the packed in-memory
1337   // layout.
1338   bool ScalarizedVector = ArgVT.isVector() && !VA.getLocVT().isVector();
1339 
1340   // This is an argument in memory. We might be able to perform copy elision.
1341   // If the argument is passed directly in memory without any extension, then we
1342   // can perform copy elision. Large vector types, for example, may be passed
1343   // indirectly by pointer.
1344   if (Flags.isCopyElisionCandidate() &&
1345       VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
1346       !ScalarizedVector) {
1347     SDValue PartAddr;
1348     if (Ins[i].PartOffset == 0) {
1349       // If this is a one-part value or the first part of a multi-part value,
1350       // create a stack object for the entire argument value type and return a
1351       // load from our portion of it. This assumes that if the first part of an
1352       // argument is in memory, the rest will also be in memory.
1353       int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
1354                                      /*IsImmutable=*/false);
1355       PartAddr = DAG.getFrameIndex(FI, PtrVT);
1356       return DAG.getLoad(
1357           ValVT, dl, Chain, PartAddr,
1358           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
1359     }
1360 
1361     // This is not the first piece of an argument in memory. See if there is
1362     // already a fixed stack object including this offset. If so, assume it
1363     // was created by the PartOffset == 0 branch above and create a load from
1364     // the appropriate offset into it.
1365     int64_t PartBegin = VA.getLocMemOffset();
1366     int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
1367     int FI = MFI.getObjectIndexBegin();
1368     for (; MFI.isFixedObjectIndex(FI); ++FI) {
1369       int64_t ObjBegin = MFI.getObjectOffset(FI);
1370       int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
1371       if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
1372         break;
1373     }
1374     if (MFI.isFixedObjectIndex(FI)) {
1375       SDValue Addr =
1376           DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
1377                       DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
1378       return DAG.getLoad(ValVT, dl, Chain, Addr,
1379                          MachinePointerInfo::getFixedStack(
1380                              DAG.getMachineFunction(), FI, Ins[i].PartOffset));
1381     }
1382   }
1383 
1384   int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
1385                                  VA.getLocMemOffset(), isImmutable);
1386 
1387   // Set SExt or ZExt flag.
1388   if (VA.getLocInfo() == CCValAssign::ZExt) {
1389     MFI.setObjectZExt(FI, true);
1390   } else if (VA.getLocInfo() == CCValAssign::SExt) {
1391     MFI.setObjectSExt(FI, true);
1392   }
1393 
1394   MaybeAlign Alignment;
1395   if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
1396       ValVT != MVT::f80)
1397     Alignment = MaybeAlign(4);
1398   SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
1399   SDValue Val = DAG.getLoad(
1400       ValVT, dl, Chain, FIN,
1401       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1402       Alignment);
1403   return ExtendedInMem
1404              ? (VA.getValVT().isVector()
1405                     ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
1406                     : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
1407              : Val;
1408 }
1409 
1410 // FIXME: Get this from tablegen.
1411 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
1412                                                 const X86Subtarget &Subtarget) {
1413   assert(Subtarget.is64Bit());
1414 
1415   if (Subtarget.isCallingConvWin64(CallConv)) {
1416     static const MCPhysReg GPR64ArgRegsWin64[] = {
1417       X86::RCX, X86::RDX, X86::R8,  X86::R9
1418     };
1419     return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
1420   }
1421 
1422   static const MCPhysReg GPR64ArgRegs64Bit[] = {
1423     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1424   };
1425   return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
1426 }
1427 
1428 // FIXME: Get this from tablegen.
1429 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
1430                                                 CallingConv::ID CallConv,
1431                                                 const X86Subtarget &Subtarget) {
1432   assert(Subtarget.is64Bit());
1433   if (Subtarget.isCallingConvWin64(CallConv)) {
1434     // The XMM registers which might contain var arg parameters are shadowed
1435     // in their paired GPR.  So we only need to save the GPR to their home
1436     // slots.
1437     // TODO: __vectorcall will change this.
1438     return std::nullopt;
1439   }
1440 
1441   bool isSoftFloat = Subtarget.useSoftFloat();
1442   if (isSoftFloat || !Subtarget.hasSSE1())
1443     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
1444     // registers.
1445     return std::nullopt;
1446 
1447   static const MCPhysReg XMMArgRegs64Bit[] = {
1448     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1449     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1450   };
1451   return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
1452 }
1453 
1454 #ifndef NDEBUG
1455 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
1456   return llvm::is_sorted(
1457       ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
1458         return A.getValNo() < B.getValNo();
1459       });
1460 }
1461 #endif
1462 
1463 namespace {
1464 /// This is a helper class for lowering variable arguments parameters.
1465 class VarArgsLoweringHelper {
1466 public:
1467   VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
1468                         SelectionDAG &DAG, const X86Subtarget &Subtarget,
1469                         CallingConv::ID CallConv, CCState &CCInfo)
1470       : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
1471         TheMachineFunction(DAG.getMachineFunction()),
1472         TheFunction(TheMachineFunction.getFunction()),
1473         FrameInfo(TheMachineFunction.getFrameInfo()),
1474         FrameLowering(*Subtarget.getFrameLowering()),
1475         TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
1476         CCInfo(CCInfo) {}
1477 
1478   // Lower variable arguments parameters.
1479   void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
1480 
1481 private:
1482   void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
1483 
1484   void forwardMustTailParameters(SDValue &Chain);
1485 
1486   bool is64Bit() const { return Subtarget.is64Bit(); }
1487   bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
1488 
1489   X86MachineFunctionInfo *FuncInfo;
1490   const SDLoc &DL;
1491   SelectionDAG &DAG;
1492   const X86Subtarget &Subtarget;
1493   MachineFunction &TheMachineFunction;
1494   const Function &TheFunction;
1495   MachineFrameInfo &FrameInfo;
1496   const TargetFrameLowering &FrameLowering;
1497   const TargetLowering &TargLowering;
1498   CallingConv::ID CallConv;
1499   CCState &CCInfo;
1500 };
1501 } // namespace
1502 
1503 void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
1504     SDValue &Chain, unsigned StackSize) {
1505   // If the function takes variable number of arguments, make a frame index for
1506   // the start of the first vararg value... for expansion of llvm.va_start. We
1507   // can skip this if there are no va_start calls.
1508   if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
1509                     CallConv != CallingConv::X86_ThisCall)) {
1510     FuncInfo->setVarArgsFrameIndex(
1511         FrameInfo.CreateFixedObject(1, StackSize, true));
1512   }
1513 
1514   // 64-bit calling conventions support varargs and register parameters, so we
1515   // have to do extra work to spill them in the prologue.
1516   if (is64Bit()) {
1517     // Find the first unallocated argument registers.
1518     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
1519     ArrayRef<MCPhysReg> ArgXMMs =
1520         get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
1521     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
1522     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
1523 
1524     assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
1525            "SSE register cannot be used when SSE is disabled!");
1526 
1527     if (isWin64()) {
1528       // Get to the caller-allocated home save location.  Add 8 to account
1529       // for the return address.
1530       int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
1531       FuncInfo->setRegSaveFrameIndex(
1532           FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
1533       // Fixup to set vararg frame on shadow area (4 x i64).
1534       if (NumIntRegs < 4)
1535         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
1536     } else {
1537       // For X86-64, if there are vararg parameters that are passed via
1538       // registers, then we must store them to their spots on the stack so
1539       // they may be loaded by dereferencing the result of va_next.
1540       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
1541       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
1542       FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
1543           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
1544     }
1545 
1546     SmallVector<SDValue, 6>
1547         LiveGPRs; // list of SDValue for GPR registers keeping live input value
1548     SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
1549                                          // keeping live input value
1550     SDValue ALVal; // if applicable keeps SDValue for %al register
1551 
1552     // Gather all the live in physical registers.
1553     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
1554       Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
1555       LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
1556     }
1557     const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
1558     if (!AvailableXmms.empty()) {
1559       Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
1560       ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
1561       for (MCPhysReg Reg : AvailableXmms) {
1562         // FastRegisterAllocator spills virtual registers at basic
1563         // block boundary. That leads to usages of xmm registers
1564         // outside of check for %al. Pass physical registers to
1565         // VASTART_SAVE_XMM_REGS to avoid unneccessary spilling.
1566         TheMachineFunction.getRegInfo().addLiveIn(Reg);
1567         LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
1568       }
1569     }
1570 
1571     // Store the integer parameter registers.
1572     SmallVector<SDValue, 8> MemOps;
1573     SDValue RSFIN =
1574         DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
1575                           TargLowering.getPointerTy(DAG.getDataLayout()));
1576     unsigned Offset = FuncInfo->getVarArgsGPOffset();
1577     for (SDValue Val : LiveGPRs) {
1578       SDValue FIN = DAG.getNode(ISD::ADD, DL,
1579                                 TargLowering.getPointerTy(DAG.getDataLayout()),
1580                                 RSFIN, DAG.getIntPtrConstant(Offset, DL));
1581       SDValue Store =
1582           DAG.getStore(Val.getValue(1), DL, Val, FIN,
1583                        MachinePointerInfo::getFixedStack(
1584                            DAG.getMachineFunction(),
1585                            FuncInfo->getRegSaveFrameIndex(), Offset));
1586       MemOps.push_back(Store);
1587       Offset += 8;
1588     }
1589 
1590     // Now store the XMM (fp + vector) parameter registers.
1591     if (!LiveXMMRegs.empty()) {
1592       SmallVector<SDValue, 12> SaveXMMOps;
1593       SaveXMMOps.push_back(Chain);
1594       SaveXMMOps.push_back(ALVal);
1595       SaveXMMOps.push_back(RSFIN);
1596       SaveXMMOps.push_back(
1597           DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
1598       llvm::append_range(SaveXMMOps, LiveXMMRegs);
1599       MachineMemOperand *StoreMMO =
1600           DAG.getMachineFunction().getMachineMemOperand(
1601               MachinePointerInfo::getFixedStack(
1602                   DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
1603                   Offset),
1604               MachineMemOperand::MOStore, 128, Align(16));
1605       MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
1606                                                DL, DAG.getVTList(MVT::Other),
1607                                                SaveXMMOps, MVT::i8, StoreMMO));
1608     }
1609 
1610     if (!MemOps.empty())
1611       Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
1612   }
1613 }
1614 
1615 void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
1616   // Find the largest legal vector type.
1617   MVT VecVT = MVT::Other;
1618   // FIXME: Only some x86_32 calling conventions support AVX512.
1619   if (Subtarget.useAVX512Regs() &&
1620       (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
1621                      CallConv == CallingConv::Intel_OCL_BI)))
1622     VecVT = MVT::v16f32;
1623   else if (Subtarget.hasAVX())
1624     VecVT = MVT::v8f32;
1625   else if (Subtarget.hasSSE2())
1626     VecVT = MVT::v4f32;
1627 
1628   // We forward some GPRs and some vector types.
1629   SmallVector<MVT, 2> RegParmTypes;
1630   MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
1631   RegParmTypes.push_back(IntVT);
1632   if (VecVT != MVT::Other)
1633     RegParmTypes.push_back(VecVT);
1634 
1635   // Compute the set of forwarded registers. The rest are scratch.
1636   SmallVectorImpl<ForwardedRegister> &Forwards =
1637       FuncInfo->getForwardedMustTailRegParms();
1638   CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
1639 
1640   // Forward AL for SysV x86_64 targets, since it is used for varargs.
1641   if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
1642     Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
1643     Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
1644   }
1645 
1646   // Copy all forwards from physical to virtual registers.
1647   for (ForwardedRegister &FR : Forwards) {
1648     // FIXME: Can we use a less constrained schedule?
1649     SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
1650     FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
1651         TargLowering.getRegClassFor(FR.VT));
1652     Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
1653   }
1654 }
1655 
1656 void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
1657                                                    unsigned StackSize) {
1658   // Set FrameIndex to the 0xAAAAAAA value to mark unset state.
1659   // If necessary, it would be set into the correct value later.
1660   FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
1661   FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1662 
1663   if (FrameInfo.hasVAStart())
1664     createVarArgAreaAndStoreRegisters(Chain, StackSize);
1665 
1666   if (FrameInfo.hasMustTailInVarArgFunc())
1667     forwardMustTailParameters(Chain);
1668 }
1669 
1670 SDValue X86TargetLowering::LowerFormalArguments(
1671     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
1672     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1673     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1674   MachineFunction &MF = DAG.getMachineFunction();
1675   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1676 
1677   const Function &F = MF.getFunction();
1678   if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
1679       F.getName() == "main")
1680     FuncInfo->setForceFramePointer(true);
1681 
1682   MachineFrameInfo &MFI = MF.getFrameInfo();
1683   bool Is64Bit = Subtarget.is64Bit();
1684   bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
1685 
1686   assert(
1687       !(IsVarArg && canGuaranteeTCO(CallConv)) &&
1688       "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
1689 
1690   // Assign locations to all of the incoming arguments.
1691   SmallVector<CCValAssign, 16> ArgLocs;
1692   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
1693 
1694   // Allocate shadow area for Win64.
1695   if (IsWin64)
1696     CCInfo.AllocateStack(32, Align(8));
1697 
1698   CCInfo.AnalyzeArguments(Ins, CC_X86);
1699 
1700   // In vectorcall calling convention a second pass is required for the HVA
1701   // types.
1702   if (CallingConv::X86_VectorCall == CallConv) {
1703     CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
1704   }
1705 
1706   // The next loop assumes that the locations are in the same order of the
1707   // input arguments.
1708   assert(isSortedByValueNo(ArgLocs) &&
1709          "Argument Location list must be sorted before lowering");
1710 
1711   SDValue ArgValue;
1712   for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
1713        ++I, ++InsIndex) {
1714     assert(InsIndex < Ins.size() && "Invalid Ins index");
1715     CCValAssign &VA = ArgLocs[I];
1716 
1717     if (VA.isRegLoc()) {
1718       EVT RegVT = VA.getLocVT();
1719       if (VA.needsCustom()) {
1720         assert(
1721             VA.getValVT() == MVT::v64i1 &&
1722             "Currently the only custom case is when we split v64i1 to 2 regs");
1723 
1724         // v64i1 values, in regcall calling convention, that are
1725         // compiled to 32 bit arch, are split up into two registers.
1726         ArgValue =
1727             getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
1728       } else {
1729         const TargetRegisterClass *RC;
1730         if (RegVT == MVT::i8)
1731           RC = &X86::GR8RegClass;
1732         else if (RegVT == MVT::i16)
1733           RC = &X86::GR16RegClass;
1734         else if (RegVT == MVT::i32)
1735           RC = &X86::GR32RegClass;
1736         else if (Is64Bit && RegVT == MVT::i64)
1737           RC = &X86::GR64RegClass;
1738         else if (RegVT == MVT::f16)
1739           RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
1740         else if (RegVT == MVT::f32)
1741           RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
1742         else if (RegVT == MVT::f64)
1743           RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
1744         else if (RegVT == MVT::f80)
1745           RC = &X86::RFP80RegClass;
1746         else if (RegVT == MVT::f128)
1747           RC = &X86::VR128RegClass;
1748         else if (RegVT.is512BitVector())
1749           RC = &X86::VR512RegClass;
1750         else if (RegVT.is256BitVector())
1751           RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
1752         else if (RegVT.is128BitVector())
1753           RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
1754         else if (RegVT == MVT::x86mmx)
1755           RC = &X86::VR64RegClass;
1756         else if (RegVT == MVT::v1i1)
1757           RC = &X86::VK1RegClass;
1758         else if (RegVT == MVT::v8i1)
1759           RC = &X86::VK8RegClass;
1760         else if (RegVT == MVT::v16i1)
1761           RC = &X86::VK16RegClass;
1762         else if (RegVT == MVT::v32i1)
1763           RC = &X86::VK32RegClass;
1764         else if (RegVT == MVT::v64i1)
1765           RC = &X86::VK64RegClass;
1766         else
1767           llvm_unreachable("Unknown argument type!");
1768 
1769         Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
1770         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1771       }
1772 
1773       // If this is an 8 or 16-bit value, it is really passed promoted to 32
1774       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1775       // right size.
1776       if (VA.getLocInfo() == CCValAssign::SExt)
1777         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1778                                DAG.getValueType(VA.getValVT()));
1779       else if (VA.getLocInfo() == CCValAssign::ZExt)
1780         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1781                                DAG.getValueType(VA.getValVT()));
1782       else if (VA.getLocInfo() == CCValAssign::BCvt)
1783         ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
1784 
1785       if (VA.isExtInLoc()) {
1786         // Handle MMX values passed in XMM regs.
1787         if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
1788           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
1789         else if (VA.getValVT().isVector() &&
1790                  VA.getValVT().getScalarType() == MVT::i1 &&
1791                  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
1792                   (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
1793           // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
1794           ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
1795         } else
1796           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1797       }
1798     } else {
1799       assert(VA.isMemLoc());
1800       ArgValue =
1801           LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
1802     }
1803 
1804     // If value is passed via pointer - do a load.
1805     if (VA.getLocInfo() == CCValAssign::Indirect &&
1806         !(Ins[I].Flags.isByVal() && VA.isRegLoc())) {
1807       ArgValue =
1808           DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
1809     }
1810 
1811     InVals.push_back(ArgValue);
1812   }
1813 
1814   for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
1815     if (Ins[I].Flags.isSwiftAsync()) {
1816       auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
1817       if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF))
1818         X86FI->setHasSwiftAsyncContext(true);
1819       else {
1820         int PtrSize = Subtarget.is64Bit() ? 8 : 4;
1821         int FI =
1822             MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize), false);
1823         X86FI->setSwiftAsyncContextFrameIdx(FI);
1824         SDValue St = DAG.getStore(
1825             DAG.getEntryNode(), dl, InVals[I],
1826             DAG.getFrameIndex(FI, PtrSize == 8 ? MVT::i64 : MVT::i32),
1827             MachinePointerInfo::getFixedStack(MF, FI));
1828         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
1829       }
1830     }
1831 
1832     // Swift calling convention does not require we copy the sret argument
1833     // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
1834     if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
1835       continue;
1836 
1837     // All x86 ABIs require that for returning structs by value we copy the
1838     // sret argument into %rax/%eax (depending on ABI) for the return. Save
1839     // the argument into a virtual register so that we can access it from the
1840     // return points.
1841     if (Ins[I].Flags.isSRet()) {
1842       assert(!FuncInfo->getSRetReturnReg() &&
1843              "SRet return has already been set");
1844       MVT PtrTy = getPointerTy(DAG.getDataLayout());
1845       Register Reg =
1846           MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
1847       FuncInfo->setSRetReturnReg(Reg);
1848       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
1849       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1850       break;
1851     }
1852   }
1853 
1854   unsigned StackSize = CCInfo.getStackSize();
1855   // Align stack specially for tail calls.
1856   if (shouldGuaranteeTCO(CallConv,
1857                          MF.getTarget().Options.GuaranteedTailCallOpt))
1858     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1859 
1860   if (IsVarArg)
1861     VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
1862         .lowerVarArgsParameters(Chain, StackSize);
1863 
1864   // Some CCs need callee pop.
1865   if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
1866                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
1867     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
1868   } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
1869     // X86 interrupts must pop the error code (and the alignment padding) if
1870     // present.
1871     FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
1872   } else {
1873     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
1874     // If this is an sret function, the return should pop the hidden pointer.
1875     if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
1876       FuncInfo->setBytesToPopOnReturn(4);
1877   }
1878 
1879   if (!Is64Bit) {
1880     // RegSaveFrameIndex is X86-64 only.
1881     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1882   }
1883 
1884   FuncInfo->setArgumentStackSize(StackSize);
1885 
1886   if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
1887     EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
1888     if (Personality == EHPersonality::CoreCLR) {
1889       assert(Is64Bit);
1890       // TODO: Add a mechanism to frame lowering that will allow us to indicate
1891       // that we'd prefer this slot be allocated towards the bottom of the frame
1892       // (i.e. near the stack pointer after allocating the frame).  Every
1893       // funclet needs a copy of this slot in its (mostly empty) frame, and the
1894       // offset from the bottom of this and each funclet's frame must be the
1895       // same, so the size of funclets' (mostly empty) frames is dictated by
1896       // how far this slot is from the bottom (since they allocate just enough
1897       // space to accommodate holding this slot at the correct offset).
1898       int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
1899       EHInfo->PSPSymFrameIdx = PSPSymFI;
1900     }
1901   }
1902 
1903   if (shouldDisableArgRegFromCSR(CallConv) ||
1904       F.hasFnAttribute("no_caller_saved_registers")) {
1905     MachineRegisterInfo &MRI = MF.getRegInfo();
1906     for (std::pair<Register, Register> Pair : MRI.liveins())
1907       MRI.disableCalleeSavedRegister(Pair.first);
1908   }
1909 
1910   if (CallingConv::PreserveNone == CallConv)
1911     for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
1912       if (Ins[I].Flags.isSwiftSelf() || Ins[I].Flags.isSwiftAsync() ||
1913           Ins[I].Flags.isSwiftError()) {
1914         errorUnsupported(DAG, dl,
1915                          "Swift attributes can't be used with preserve_none");
1916         break;
1917       }
1918     }
1919 
1920   return Chain;
1921 }
1922 
1923 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
1924                                             SDValue Arg, const SDLoc &dl,
1925                                             SelectionDAG &DAG,
1926                                             const CCValAssign &VA,
1927                                             ISD::ArgFlagsTy Flags,
1928                                             bool isByVal) const {
1929   unsigned LocMemOffset = VA.getLocMemOffset();
1930   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1931   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1932                        StackPtr, PtrOff);
1933   if (isByVal)
1934     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1935 
1936   MaybeAlign Alignment;
1937   if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
1938       Arg.getSimpleValueType() != MVT::f80)
1939     Alignment = MaybeAlign(4);
1940   return DAG.getStore(
1941       Chain, dl, Arg, PtrOff,
1942       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
1943       Alignment);
1944 }
1945 
1946 /// Emit a load of return address if tail call
1947 /// optimization is performed and it is required.
1948 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
1949     SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
1950     bool Is64Bit, int FPDiff, const SDLoc &dl) const {
1951   // Adjust the Return address stack slot.
1952   EVT VT = getPointerTy(DAG.getDataLayout());
1953   OutRetAddr = getReturnAddressFrameIndex(DAG);
1954 
1955   // Load the "old" Return address.
1956   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
1957   return SDValue(OutRetAddr.getNode(), 1);
1958 }
1959 
1960 /// Emit a store of the return address if tail call
1961 /// optimization is performed and it is required (FPDiff!=0).
1962 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
1963                                         SDValue Chain, SDValue RetAddrFrIdx,
1964                                         EVT PtrVT, unsigned SlotSize,
1965                                         int FPDiff, const SDLoc &dl) {
1966   // Store the return address to the appropriate stack slot.
1967   if (!FPDiff) return Chain;
1968   // Calculate the new stack slot for the return address.
1969   int NewReturnAddrFI =
1970     MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
1971                                          false);
1972   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
1973   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1974                        MachinePointerInfo::getFixedStack(
1975                            DAG.getMachineFunction(), NewReturnAddrFI));
1976   return Chain;
1977 }
1978 
1979 /// Returns a vector_shuffle mask for an movs{s|d}, movd
1980 /// operation of specified width.
1981 SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
1982                                    SDValue V1, SDValue V2) const {
1983   unsigned NumElems = VT.getVectorNumElements();
1984   SmallVector<int, 8> Mask;
1985   Mask.push_back(NumElems);
1986   for (unsigned i = 1; i != NumElems; ++i)
1987     Mask.push_back(i);
1988   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
1989 }
1990 
1991 SDValue
1992 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1993                              SmallVectorImpl<SDValue> &InVals) const {
1994   SelectionDAG &DAG                     = CLI.DAG;
1995   SDLoc &dl                             = CLI.DL;
1996   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1997   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
1998   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
1999   SDValue Chain                         = CLI.Chain;
2000   SDValue Callee                        = CLI.Callee;
2001   CallingConv::ID CallConv              = CLI.CallConv;
2002   bool &isTailCall                      = CLI.IsTailCall;
2003   bool isVarArg                         = CLI.IsVarArg;
2004   const auto *CB                        = CLI.CB;
2005 
2006   MachineFunction &MF = DAG.getMachineFunction();
2007   bool Is64Bit        = Subtarget.is64Bit();
2008   bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
2009   bool IsSibcall      = false;
2010   bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
2011       CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
2012   bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
2013   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2014   bool HasNCSR = (CB && isa<CallInst>(CB) &&
2015                   CB->hasFnAttr("no_caller_saved_registers"));
2016   bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
2017   bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
2018   bool IsCFICall = IsIndirectCall && CLI.CFIType;
2019   const Module *M = MF.getFunction().getParent();
2020   Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
2021 
2022   MachineFunction::CallSiteInfo CSInfo;
2023   if (CallConv == CallingConv::X86_INTR)
2024     report_fatal_error("X86 interrupts may not be called directly");
2025 
2026   // Analyze operands of the call, assigning locations to each operand.
2027   SmallVector<CCValAssign, 16> ArgLocs;
2028   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2029 
2030   // Allocate shadow area for Win64.
2031   if (IsWin64)
2032     CCInfo.AllocateStack(32, Align(8));
2033 
2034   CCInfo.AnalyzeArguments(Outs, CC_X86);
2035 
2036   // In vectorcall calling convention a second pass is required for the HVA
2037   // types.
2038   if (CallingConv::X86_VectorCall == CallConv) {
2039     CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
2040   }
2041 
2042   bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
2043   if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
2044     // If we are using a GOT, disable tail calls to external symbols with
2045     // default visibility. Tail calling such a symbol requires using a GOT
2046     // relocation, which forces early binding of the symbol. This breaks code
2047     // that require lazy function symbol resolution. Using musttail or
2048     // GuaranteedTailCallOpt will override this.
2049     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2050     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
2051                G->getGlobal()->hasDefaultVisibility()))
2052       isTailCall = false;
2053   }
2054 
2055   if (isTailCall && !IsMustTail) {
2056     // Check if it's really possible to do a tail call.
2057     isTailCall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs,
2058                                                    IsCalleePopSRet);
2059 
2060     // Sibcalls are automatically detected tailcalls which do not require
2061     // ABI changes.
2062     if (!IsGuaranteeTCO && isTailCall)
2063       IsSibcall = true;
2064 
2065     if (isTailCall)
2066       ++NumTailCalls;
2067   }
2068 
2069   if (IsMustTail && !isTailCall)
2070     report_fatal_error("failed to perform tail call elimination on a call "
2071                        "site marked musttail");
2072 
2073   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
2074          "Var args not supported with calling convention fastcc, ghc or hipe");
2075 
2076   // Get a count of how many bytes are to be pushed on the stack.
2077   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
2078   if (IsSibcall)
2079     // This is a sibcall. The memory operands are available in caller's
2080     // own caller's stack.
2081     NumBytes = 0;
2082   else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
2083     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2084 
2085   int FPDiff = 0;
2086   if (isTailCall &&
2087       shouldGuaranteeTCO(CallConv,
2088                          MF.getTarget().Options.GuaranteedTailCallOpt)) {
2089     // Lower arguments at fp - stackoffset + fpdiff.
2090     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2091 
2092     FPDiff = NumBytesCallerPushed - NumBytes;
2093 
2094     // Set the delta of movement of the returnaddr stackslot.
2095     // But only set if delta is greater than previous delta.
2096     if (FPDiff < X86Info->getTCReturnAddrDelta())
2097       X86Info->setTCReturnAddrDelta(FPDiff);
2098   }
2099 
2100   unsigned NumBytesToPush = NumBytes;
2101   unsigned NumBytesToPop = NumBytes;
2102 
2103   // If we have an inalloca argument, all stack space has already been allocated
2104   // for us and be right at the top of the stack.  We don't support multiple
2105   // arguments passed in memory when using inalloca.
2106   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2107     NumBytesToPush = 0;
2108     if (!ArgLocs.back().isMemLoc())
2109       report_fatal_error("cannot use inalloca attribute on a register "
2110                          "parameter");
2111     if (ArgLocs.back().getLocMemOffset() != 0)
2112       report_fatal_error("any parameter with the inalloca attribute must be "
2113                          "the only memory argument");
2114   } else if (CLI.IsPreallocated) {
2115     assert(ArgLocs.back().isMemLoc() &&
2116            "cannot use preallocated attribute on a register "
2117            "parameter");
2118     SmallVector<size_t, 4> PreallocatedOffsets;
2119     for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
2120       if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
2121         PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
2122       }
2123     }
2124     auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
2125     size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
2126     MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
2127     MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
2128     NumBytesToPush = 0;
2129   }
2130 
2131   if (!IsSibcall && !IsMustTail)
2132     Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
2133                                  NumBytes - NumBytesToPush, dl);
2134 
2135   SDValue RetAddrFrIdx;
2136   // Load return address for tail calls.
2137   if (isTailCall && FPDiff)
2138     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2139                                     Is64Bit, FPDiff, dl);
2140 
2141   SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
2142   SmallVector<SDValue, 8> MemOpChains;
2143   SDValue StackPtr;
2144 
2145   // The next loop assumes that the locations are in the same order of the
2146   // input arguments.
2147   assert(isSortedByValueNo(ArgLocs) &&
2148          "Argument Location list must be sorted before lowering");
2149 
2150   // Walk the register/memloc assignments, inserting copies/loads.  In the case
2151   // of tail call optimization arguments are handle later.
2152   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2153   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
2154        ++I, ++OutIndex) {
2155     assert(OutIndex < Outs.size() && "Invalid Out index");
2156     // Skip inalloca/preallocated arguments, they have already been written.
2157     ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
2158     if (Flags.isInAlloca() || Flags.isPreallocated())
2159       continue;
2160 
2161     CCValAssign &VA = ArgLocs[I];
2162     EVT RegVT = VA.getLocVT();
2163     SDValue Arg = OutVals[OutIndex];
2164     bool isByVal = Flags.isByVal();
2165 
2166     // Promote the value if needed.
2167     switch (VA.getLocInfo()) {
2168     default: llvm_unreachable("Unknown loc info!");
2169     case CCValAssign::Full: break;
2170     case CCValAssign::SExt:
2171       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2172       break;
2173     case CCValAssign::ZExt:
2174       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2175       break;
2176     case CCValAssign::AExt:
2177       if (Arg.getValueType().isVector() &&
2178           Arg.getValueType().getVectorElementType() == MVT::i1)
2179         Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
2180       else if (RegVT.is128BitVector()) {
2181         // Special case: passing MMX values in XMM registers.
2182         Arg = DAG.getBitcast(MVT::i64, Arg);
2183         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2184         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2185       } else
2186         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2187       break;
2188     case CCValAssign::BCvt:
2189       Arg = DAG.getBitcast(RegVT, Arg);
2190       break;
2191     case CCValAssign::Indirect: {
2192       if (isByVal) {
2193         // Memcpy the argument to a temporary stack slot to prevent
2194         // the caller from seeing any modifications the callee may make
2195         // as guaranteed by the `byval` attribute.
2196         int FrameIdx = MF.getFrameInfo().CreateStackObject(
2197             Flags.getByValSize(),
2198             std::max(Align(16), Flags.getNonZeroByValAlign()), false);
2199         SDValue StackSlot =
2200             DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
2201         Chain =
2202             CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
2203         // From now on treat this as a regular pointer
2204         Arg = StackSlot;
2205         isByVal = false;
2206       } else {
2207         // Store the argument.
2208         SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2209         int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2210         Chain = DAG.getStore(
2211             Chain, dl, Arg, SpillSlot,
2212             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2213         Arg = SpillSlot;
2214       }
2215       break;
2216     }
2217     }
2218 
2219     if (VA.needsCustom()) {
2220       assert(VA.getValVT() == MVT::v64i1 &&
2221              "Currently the only custom case is when we split v64i1 to 2 regs");
2222       // Split v64i1 value into two registers
2223       Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
2224     } else if (VA.isRegLoc()) {
2225       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2226       const TargetOptions &Options = DAG.getTarget().Options;
2227       if (Options.EmitCallSiteInfo)
2228         CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), I);
2229       if (isVarArg && IsWin64) {
2230         // Win64 ABI requires argument XMM reg to be copied to the corresponding
2231         // shadow reg if callee is a varargs function.
2232         Register ShadowReg;
2233         switch (VA.getLocReg()) {
2234         case X86::XMM0: ShadowReg = X86::RCX; break;
2235         case X86::XMM1: ShadowReg = X86::RDX; break;
2236         case X86::XMM2: ShadowReg = X86::R8; break;
2237         case X86::XMM3: ShadowReg = X86::R9; break;
2238         }
2239         if (ShadowReg)
2240           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2241       }
2242     } else if (!IsSibcall && (!isTailCall || isByVal)) {
2243       assert(VA.isMemLoc());
2244       if (!StackPtr.getNode())
2245         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2246                                       getPointerTy(DAG.getDataLayout()));
2247       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2248                                              dl, DAG, VA, Flags, isByVal));
2249     }
2250   }
2251 
2252   if (!MemOpChains.empty())
2253     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2254 
2255   if (Subtarget.isPICStyleGOT()) {
2256     // ELF / PIC requires GOT in the EBX register before function calls via PLT
2257     // GOT pointer (except regcall).
2258     if (!isTailCall) {
2259       // Indirect call with RegCall calling convertion may use up all the
2260       // general registers, so it is not suitable to bind EBX reister for
2261       // GOT address, just let register allocator handle it.
2262       if (CallConv != CallingConv::X86_RegCall)
2263         RegsToPass.push_back(std::make_pair(
2264           Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2265                                           getPointerTy(DAG.getDataLayout()))));
2266     } else {
2267       // If we are tail calling and generating PIC/GOT style code load the
2268       // address of the callee into ECX. The value in ecx is used as target of
2269       // the tail jump. This is done to circumvent the ebx/callee-saved problem
2270       // for tail calls on PIC/GOT architectures. Normally we would just put the
2271       // address of GOT into ebx and then call target@PLT. But for tail calls
2272       // ebx would be restored (since ebx is callee saved) before jumping to the
2273       // target@PLT.
2274 
2275       // Note: The actual moving to ECX is done further down.
2276       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2277       if (G && !G->getGlobal()->hasLocalLinkage() &&
2278           G->getGlobal()->hasDefaultVisibility())
2279         Callee = LowerGlobalAddress(Callee, DAG);
2280       else if (isa<ExternalSymbolSDNode>(Callee))
2281         Callee = LowerExternalSymbol(Callee, DAG);
2282     }
2283   }
2284 
2285   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
2286       (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
2287     // From AMD64 ABI document:
2288     // For calls that may call functions that use varargs or stdargs
2289     // (prototype-less calls or calls to functions containing ellipsis (...) in
2290     // the declaration) %al is used as hidden argument to specify the number
2291     // of SSE registers used. The contents of %al do not need to match exactly
2292     // the number of registers, but must be an ubound on the number of SSE
2293     // registers used and is in the range 0 - 8 inclusive.
2294 
2295     // Count the number of XMM registers allocated.
2296     static const MCPhysReg XMMArgRegs[] = {
2297       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2298       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2299     };
2300     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
2301     assert((Subtarget.hasSSE1() || !NumXMMRegs)
2302            && "SSE registers cannot be used when SSE is disabled");
2303     RegsToPass.push_back(std::make_pair(Register(X86::AL),
2304                                         DAG.getConstant(NumXMMRegs, dl,
2305                                                         MVT::i8)));
2306   }
2307 
2308   if (isVarArg && IsMustTail) {
2309     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
2310     for (const auto &F : Forwards) {
2311       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2312       RegsToPass.push_back(std::make_pair(F.PReg, Val));
2313     }
2314   }
2315 
2316   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
2317   // don't need this because the eligibility check rejects calls that require
2318   // shuffling arguments passed in memory.
2319   if (!IsSibcall && isTailCall) {
2320     // Force all the incoming stack arguments to be loaded from the stack
2321     // before any new outgoing arguments are stored to the stack, because the
2322     // outgoing stack slots may alias the incoming argument stack slots, and
2323     // the alias isn't otherwise explicit. This is slightly more conservative
2324     // than necessary, because it means that each store effectively depends
2325     // on every argument instead of just those arguments it would clobber.
2326     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2327 
2328     SmallVector<SDValue, 8> MemOpChains2;
2329     SDValue FIN;
2330     int FI = 0;
2331     for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
2332          ++I, ++OutsIndex) {
2333       CCValAssign &VA = ArgLocs[I];
2334 
2335       if (VA.isRegLoc()) {
2336         if (VA.needsCustom()) {
2337           assert((CallConv == CallingConv::X86_RegCall) &&
2338                  "Expecting custom case only in regcall calling convention");
2339           // This means that we are in special case where one argument was
2340           // passed through two register locations - Skip the next location
2341           ++I;
2342         }
2343 
2344         continue;
2345       }
2346 
2347       assert(VA.isMemLoc());
2348       SDValue Arg = OutVals[OutsIndex];
2349       ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
2350       // Skip inalloca/preallocated arguments.  They don't require any work.
2351       if (Flags.isInAlloca() || Flags.isPreallocated())
2352         continue;
2353       // Create frame index.
2354       int32_t Offset = VA.getLocMemOffset()+FPDiff;
2355       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2356       FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
2357       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2358 
2359       if (Flags.isByVal()) {
2360         // Copy relative to framepointer.
2361         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
2362         if (!StackPtr.getNode())
2363           StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2364                                         getPointerTy(DAG.getDataLayout()));
2365         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2366                              StackPtr, Source);
2367 
2368         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2369                                                          ArgChain,
2370                                                          Flags, DAG, dl));
2371       } else {
2372         // Store relative to framepointer.
2373         MemOpChains2.push_back(DAG.getStore(
2374             ArgChain, dl, Arg, FIN,
2375             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
2376       }
2377     }
2378 
2379     if (!MemOpChains2.empty())
2380       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
2381 
2382     // Store the return address to the appropriate stack slot.
2383     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
2384                                      getPointerTy(DAG.getDataLayout()),
2385                                      RegInfo->getSlotSize(), FPDiff, dl);
2386   }
2387 
2388   // Build a sequence of copy-to-reg nodes chained together with token chain
2389   // and glue operands which copy the outgoing args into registers.
2390   SDValue InGlue;
2391   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2392     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2393                              RegsToPass[i].second, InGlue);
2394     InGlue = Chain.getValue(1);
2395   }
2396 
2397   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
2398     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2399     // In the 64-bit large code model, we have to make all calls
2400     // through a register, since the call instruction's 32-bit
2401     // pc-relative offset may not be large enough to hold the whole
2402     // address.
2403   } else if (Callee->getOpcode() == ISD::GlobalAddress ||
2404              Callee->getOpcode() == ISD::ExternalSymbol) {
2405     // Lower direct calls to global addresses and external symbols. Setting
2406     // ForCall to true here has the effect of removing WrapperRIP when possible
2407     // to allow direct calls to be selected without first materializing the
2408     // address into a register.
2409     Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
2410   } else if (Subtarget.isTarget64BitILP32() &&
2411              Callee.getValueType() == MVT::i32) {
2412     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
2413     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
2414   }
2415 
2416   // Returns a chain & a glue for retval copy to use.
2417   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2418   SmallVector<SDValue, 8> Ops;
2419 
2420   if (!IsSibcall && isTailCall && !IsMustTail) {
2421     Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
2422     InGlue = Chain.getValue(1);
2423   }
2424 
2425   Ops.push_back(Chain);
2426   Ops.push_back(Callee);
2427 
2428   if (isTailCall)
2429     Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
2430 
2431   // Add argument registers to the end of the list so that they are known live
2432   // into the call.
2433   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2434     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2435                                   RegsToPass[i].second.getValueType()));
2436 
2437   // Add a register mask operand representing the call-preserved registers.
2438   const uint32_t *Mask = [&]() {
2439     auto AdaptedCC = CallConv;
2440     // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
2441     // use X86_INTR calling convention because it has the same CSR mask
2442     // (same preserved registers).
2443     if (HasNCSR)
2444       AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
2445     // If NoCalleeSavedRegisters is requested, than use GHC since it happens
2446     // to use the CSR_NoRegs_RegMask.
2447     if (CB && CB->hasFnAttr("no_callee_saved_registers"))
2448       AdaptedCC = (CallingConv::ID)CallingConv::GHC;
2449     return RegInfo->getCallPreservedMask(MF, AdaptedCC);
2450   }();
2451   assert(Mask && "Missing call preserved mask for calling convention");
2452 
2453   // If this is an invoke in a 32-bit function using a funclet-based
2454   // personality, assume the function clobbers all registers. If an exception
2455   // is thrown, the runtime will not restore CSRs.
2456   // FIXME: Model this more precisely so that we can register allocate across
2457   // the normal edge and spill and fill across the exceptional edge.
2458   if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
2459     const Function &CallerFn = MF.getFunction();
2460     EHPersonality Pers =
2461         CallerFn.hasPersonalityFn()
2462             ? classifyEHPersonality(CallerFn.getPersonalityFn())
2463             : EHPersonality::Unknown;
2464     if (isFuncletEHPersonality(Pers))
2465       Mask = RegInfo->getNoPreservedMask();
2466   }
2467 
2468   // Define a new register mask from the existing mask.
2469   uint32_t *RegMask = nullptr;
2470 
2471   // In some calling conventions we need to remove the used physical registers
2472   // from the reg mask. Create a new RegMask for such calling conventions.
2473   // RegMask for calling conventions that disable only return registers (e.g.
2474   // preserve_most) will be modified later in LowerCallResult.
2475   bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
2476   if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
2477     const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2478 
2479     // Allocate a new Reg Mask and copy Mask.
2480     RegMask = MF.allocateRegMask();
2481     unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
2482     memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
2483 
2484     // Make sure all sub registers of the argument registers are reset
2485     // in the RegMask.
2486     if (ShouldDisableArgRegs) {
2487       for (auto const &RegPair : RegsToPass)
2488         for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first))
2489           RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
2490     }
2491 
2492     // Create the RegMask Operand according to our updated mask.
2493     Ops.push_back(DAG.getRegisterMask(RegMask));
2494   } else {
2495     // Create the RegMask Operand according to the static mask.
2496     Ops.push_back(DAG.getRegisterMask(Mask));
2497   }
2498 
2499   if (InGlue.getNode())
2500     Ops.push_back(InGlue);
2501 
2502   if (isTailCall) {
2503     // We used to do:
2504     //// If this is the first return lowered for this function, add the regs
2505     //// to the liveout set for the function.
2506     // This isn't right, although it's probably harmless on x86; liveouts
2507     // should be computed from returns not tail calls.  Consider a void
2508     // function making a tail call to a function returning int.
2509     MF.getFrameInfo().setHasTailCall();
2510     SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
2511 
2512     if (IsCFICall)
2513       Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2514 
2515     DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2516     DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2517     return Ret;
2518   }
2519 
2520   if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
2521     Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
2522   } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
2523     // Calls with a "clang.arc.attachedcall" bundle are special. They should be
2524     // expanded to the call, directly followed by a special marker sequence and
2525     // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
2526     assert(!isTailCall &&
2527            "tail calls cannot be marked with clang.arc.attachedcall");
2528     assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
2529 
2530     // Add a target global address for the retainRV/claimRV runtime function
2531     // just before the call target.
2532     Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
2533     auto PtrVT = getPointerTy(DAG.getDataLayout());
2534     auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
2535     Ops.insert(Ops.begin() + 1, GA);
2536     Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
2537   } else {
2538     Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
2539   }
2540 
2541   if (IsCFICall)
2542     Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2543 
2544   InGlue = Chain.getValue(1);
2545   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2546   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2547 
2548   // Save heapallocsite metadata.
2549   if (CLI.CB)
2550     if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
2551       DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
2552 
2553   // Create the CALLSEQ_END node.
2554   unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
2555   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2556                        DAG.getTarget().Options.GuaranteedTailCallOpt))
2557     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
2558   else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
2559     // If this call passes a struct-return pointer, the callee
2560     // pops that struct pointer.
2561     NumBytesForCalleeToPop = 4;
2562 
2563   // Returns a glue for retval copy to use.
2564   if (!IsSibcall) {
2565     Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
2566                                InGlue, dl);
2567     InGlue = Chain.getValue(1);
2568   }
2569 
2570   if (CallingConv::PreserveNone == CallConv)
2571     for (unsigned I = 0, E = Outs.size(); I != E; ++I) {
2572       if (Outs[I].Flags.isSwiftSelf() || Outs[I].Flags.isSwiftAsync() ||
2573           Outs[I].Flags.isSwiftError()) {
2574         errorUnsupported(DAG, dl,
2575                          "Swift attributes can't be used with preserve_none");
2576         break;
2577       }
2578     }
2579 
2580   // Handle result values, copying them out of physregs into vregs that we
2581   // return.
2582   return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2583                          InVals, RegMask);
2584 }
2585 
2586 //===----------------------------------------------------------------------===//
2587 //                Fast Calling Convention (tail call) implementation
2588 //===----------------------------------------------------------------------===//
2589 
//  Like the stdcall convention (the callee cleans up the arguments), except
//  that ECX is reserved for storing the tail called function address. Only 2
//  registers are free for argument passing (inreg). Tail call optimization is
//  performed provided:
2594 //                * tailcallopt is enabled
2595 //                * caller/callee are fastcc
2596 //  On X86_64 architecture with GOT-style position independent code only local
2597 //  (within module) calls are supported at the moment.
2598 //  To keep the stack aligned according to platform abi the function
2599 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
2600 //  of stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
2601 //  If a tail called function callee has more arguments than the caller the
2602 //  caller needs to make sure that there is room to move the RETADDR to. This is
2603 //  achieved by reserving an area the size of the argument delta right after the
2604 //  original RETADDR, but before the saved framepointer or the spilled registers
2605 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
2606 //  stack layout:
2607 //    arg1
2608 //    arg2
2609 //    RETADDR
2610 //    [ new RETADDR
2611 //      move area ]
2612 //    (possible EBP)
2613 //    ESI
2614 //    EDI
2615 //    local1 ..
2616 
2617 /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
2618 /// requirement.
2619 unsigned
2620 X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
2621                                                SelectionDAG &DAG) const {
2622   const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
2623   const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
2624   assert(StackSize % SlotSize == 0 &&
2625          "StackSize must be a multiple of SlotSize");
2626   return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
2627 }
2628 
2629 /// Return true if the given stack call argument is already available in the
2630 /// same position (relatively) of the caller's incoming argument stack.
2631 static
2632 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2633                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2634                          const X86InstrInfo *TII, const CCValAssign &VA) {
2635   unsigned Bytes = Arg.getValueSizeInBits() / 8;
2636 
2637   for (;;) {
2638     // Look through nodes that don't alter the bits of the incoming value.
2639     unsigned Op = Arg.getOpcode();
2640     if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
2641         Op == ISD::AssertZext) {
2642       Arg = Arg.getOperand(0);
2643       continue;
2644     }
2645     if (Op == ISD::TRUNCATE) {
2646       const SDValue &TruncInput = Arg.getOperand(0);
2647       if (TruncInput.getOpcode() == ISD::AssertZext &&
2648           cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
2649               Arg.getValueType()) {
2650         Arg = TruncInput.getOperand(0);
2651         continue;
2652       }
2653     }
2654     break;
2655   }
2656 
2657   int FI = INT_MAX;
2658   if (Arg.getOpcode() == ISD::CopyFromReg) {
2659     Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2660     if (!VR.isVirtual())
2661       return false;
2662     MachineInstr *Def = MRI->getVRegDef(VR);
2663     if (!Def)
2664       return false;
2665     if (!Flags.isByVal()) {
2666       if (!TII->isLoadFromStackSlot(*Def, FI))
2667         return false;
2668     } else {
2669       unsigned Opcode = Def->getOpcode();
2670       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
2671            Opcode == X86::LEA64_32r) &&
2672           Def->getOperand(1).isFI()) {
2673         FI = Def->getOperand(1).getIndex();
2674         Bytes = Flags.getByValSize();
2675       } else
2676         return false;
2677     }
2678   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2679     if (Flags.isByVal())
2680       // ByVal argument is passed in as a pointer but it's now being
2681       // dereferenced. e.g.
2682       // define @foo(%struct.X* %A) {
2683       //   tail call @bar(%struct.X* byval %A)
2684       // }
2685       return false;
2686     SDValue Ptr = Ld->getBasePtr();
2687     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2688     if (!FINode)
2689       return false;
2690     FI = FINode->getIndex();
2691   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
2692     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
2693     FI = FINode->getIndex();
2694     Bytes = Flags.getByValSize();
2695   } else
2696     return false;
2697 
2698   assert(FI != INT_MAX);
2699   if (!MFI.isFixedObjectIndex(FI))
2700     return false;
2701 
2702   if (Offset != MFI.getObjectOffset(FI))
2703     return false;
2704 
2705   // If this is not byval, check that the argument stack object is immutable.
2706   // inalloca and argument copy elision can create mutable argument stack
2707   // objects. Byval objects can be mutated, but a byval call intends to pass the
2708   // mutated memory.
2709   if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
2710     return false;
2711 
2712   if (VA.getLocVT().getFixedSizeInBits() >
2713       Arg.getValueSizeInBits().getFixedValue()) {
2714     // If the argument location is wider than the argument type, check that any
2715     // extension flags match.
2716     if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
2717         Flags.isSExt() != MFI.isObjectSExt(FI)) {
2718       return false;
2719     }
2720   }
2721 
2722   return Bytes == MFI.getObjectSize(FI);
2723 }
2724 
2725 /// Check whether the call is eligible for tail call optimization. Targets
2726 /// that want to do tail call optimization should implement this function.
2727 /// Note that the x86 backend does not check musttail calls for eligibility! The
2728 /// rest of x86 tail call lowering must be prepared to forward arguments of any
2729 /// type.
2730 bool X86TargetLowering::IsEligibleForTailCallOptimization(
2731     TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
2732     SmallVectorImpl<CCValAssign> &ArgLocs, bool IsCalleePopSRet) const {
2733   SelectionDAG &DAG = CLI.DAG;
2734   const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2735   const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2736   const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2737   SDValue Callee = CLI.Callee;
2738   CallingConv::ID CalleeCC = CLI.CallConv;
2739   bool isVarArg = CLI.IsVarArg;
2740 
2741   if (!mayTailCallThisCC(CalleeCC))
2742     return false;
2743 
2744   // If -tailcallopt is specified, make fastcc functions tail-callable.
2745   MachineFunction &MF = DAG.getMachineFunction();
2746   const Function &CallerF = MF.getFunction();
2747 
2748   // If the function return type is x86_fp80 and the callee return type is not,
2749   // then the FP_EXTEND of the call result is not a nop. It's not safe to
2750   // perform a tailcall optimization here.
2751   if (CallerF.getReturnType()->isX86_FP80Ty() && !CLI.RetTy->isX86_FP80Ty())
2752     return false;
2753 
2754   CallingConv::ID CallerCC = CallerF.getCallingConv();
2755   bool CCMatch = CallerCC == CalleeCC;
2756   bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
2757   bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
2758   bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
2759       CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
2760 
2761   // Win64 functions have extra shadow space for argument homing. Don't do the
2762   // sibcall if the caller and callee have mismatched expectations for this
2763   // space.
2764   if (IsCalleeWin64 != IsCallerWin64)
2765     return false;
2766 
2767   if (IsGuaranteeTCO) {
2768     if (canGuaranteeTCO(CalleeCC) && CCMatch)
2769       return true;
2770     return false;
2771   }
2772 
2773   // Look for obvious safe cases to perform tail call optimization that do not
2774   // require ABI changes. This is what gcc calls sibcall.
2775 
2776   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
2777   // emit a special epilogue.
2778   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2779   if (RegInfo->hasStackRealignment(MF))
2780     return false;
2781 
2782   // Also avoid sibcall optimization if we're an sret return fn and the callee
2783   // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
2784   // insufficient.
2785   if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
2786     // For a compatible tail call the callee must return our sret pointer. So it
2787     // needs to be (a) an sret function itself and (b) we pass our sret as its
2788     // sret. Condition #b is harder to determine.
2789     return false;
2790   } else if (IsCalleePopSRet)
2791     // The callee pops an sret, so we cannot tail-call, as our caller doesn't
2792     // expect that.
2793     return false;
2794 
2795   // Do not sibcall optimize vararg calls unless all arguments are passed via
2796   // registers.
2797   LLVMContext &C = *DAG.getContext();
2798   if (isVarArg && !Outs.empty()) {
2799     // Optimizing for varargs on Win64 is unlikely to be safe without
2800     // additional testing.
2801     if (IsCalleeWin64 || IsCallerWin64)
2802       return false;
2803 
2804     for (const auto &VA : ArgLocs)
2805       if (!VA.isRegLoc())
2806         return false;
2807   }
2808 
2809   // If the call result is in ST0 / ST1, it needs to be popped off the x87
2810   // stack.  Therefore, if it's not used by the call it is not safe to optimize
2811   // this into a sibcall.
2812   bool Unused = false;
2813   for (const auto &In : Ins) {
2814     if (!In.Used) {
2815       Unused = true;
2816       break;
2817     }
2818   }
2819   if (Unused) {
2820     SmallVector<CCValAssign, 16> RVLocs;
2821     CCState RVCCInfo(CalleeCC, false, MF, RVLocs, C);
2822     RVCCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2823     for (const auto &VA : RVLocs) {
2824       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
2825         return false;
2826     }
2827   }
2828 
2829   // Check that the call results are passed in the same way.
2830   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
2831                                   RetCC_X86, RetCC_X86))
2832     return false;
2833   // The callee has to preserve all registers the caller needs to preserve.
2834   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2835   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2836   if (!CCMatch) {
2837     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2838     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2839       return false;
2840   }
2841 
2842   unsigned StackArgsSize = CCInfo.getStackSize();
2843 
2844   // If the callee takes no arguments then go on to check the results of the
2845   // call.
2846   if (!Outs.empty()) {
2847     if (StackArgsSize > 0) {
2848       // Check if the arguments are already laid out in the right way as
2849       // the caller's fixed stack objects.
2850       MachineFrameInfo &MFI = MF.getFrameInfo();
2851       const MachineRegisterInfo *MRI = &MF.getRegInfo();
2852       const X86InstrInfo *TII = Subtarget.getInstrInfo();
2853       for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
2854         const CCValAssign &VA = ArgLocs[I];
2855         SDValue Arg = OutVals[I];
2856         ISD::ArgFlagsTy Flags = Outs[I].Flags;
2857         if (VA.getLocInfo() == CCValAssign::Indirect)
2858           return false;
2859         if (!VA.isRegLoc()) {
2860           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI,
2861                                    TII, VA))
2862             return false;
2863         }
2864       }
2865     }
2866 
2867     bool PositionIndependent = isPositionIndependent();
2868     // If the tailcall address may be in a register, then make sure it's
2869     // possible to register allocate for it. In 32-bit, the call address can
2870     // only target EAX, EDX, or ECX since the tail call must be scheduled after
2871     // callee-saved registers are restored. These happen to be the same
2872     // registers used to pass 'inreg' arguments so watch out for those.
2873     if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
2874                                   !isa<ExternalSymbolSDNode>(Callee)) ||
2875                                  PositionIndependent)) {
2876       unsigned NumInRegs = 0;
2877       // In PIC we need an extra register to formulate the address computation
2878       // for the callee.
2879       unsigned MaxInRegs = PositionIndependent ? 2 : 3;
2880 
2881       for (const auto &VA : ArgLocs) {
2882         if (!VA.isRegLoc())
2883           continue;
2884         Register Reg = VA.getLocReg();
2885         switch (Reg) {
2886         default: break;
2887         case X86::EAX: case X86::EDX: case X86::ECX:
2888           if (++NumInRegs == MaxInRegs)
2889             return false;
2890           break;
2891         }
2892       }
2893     }
2894 
2895     const MachineRegisterInfo &MRI = MF.getRegInfo();
2896     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
2897       return false;
2898   }
2899 
2900   bool CalleeWillPop =
2901       X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
2902                        MF.getTarget().Options.GuaranteedTailCallOpt);
2903 
2904   if (unsigned BytesToPop =
2905           MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
2906     // If we have bytes to pop, the callee must pop them.
2907     bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
2908     if (!CalleePopMatches)
2909       return false;
2910   } else if (CalleeWillPop && StackArgsSize > 0) {
2911     // If we don't have bytes to pop, make sure the callee doesn't pop any.
2912     return false;
2913   }
2914 
2915   return true;
2916 }
2917 
2918 /// Determines whether the callee is required to pop its own arguments.
2919 /// Callee pop is necessary to support tail calls.
2920 bool X86::isCalleePop(CallingConv::ID CallingConv,
2921                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
2922   // If GuaranteeTCO is true, we force some calls to be callee pop so that we
2923   // can guarantee TCO.
2924   if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
2925     return true;
2926 
2927   switch (CallingConv) {
2928   default:
2929     return false;
2930   case CallingConv::X86_StdCall:
2931   case CallingConv::X86_FastCall:
2932   case CallingConv::X86_ThisCall:
2933   case CallingConv::X86_VectorCall:
2934     return !is64Bit;
2935   }
2936 }
2937