xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLoweringCall.cpp (revision 0d7056458db5b5dd7fdc5ccd8abab73e3ee76a20)
1 //===- llvm/lib/Target/X86/X86ISelCallLowering.cpp - Call lowering --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file implements the lowering of LLVM calls to DAG nodes.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "X86.h"
15 #include "X86CallingConv.h"
16 #include "X86FrameLowering.h"
17 #include "X86ISelLowering.h"
18 #include "X86InstrBuilder.h"
19 #include "X86MachineFunctionInfo.h"
20 #include "X86TargetMachine.h"
21 #include "X86TargetObjectFile.h"
22 #include "llvm/ADT/Statistic.h"
23 #include "llvm/Analysis/ObjCARCUtil.h"
24 #include "llvm/CodeGen/MachineJumpTableInfo.h"
25 #include "llvm/CodeGen/MachineModuleInfo.h"
26 #include "llvm/CodeGen/WinEHFuncInfo.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IRBuilder.h"
29 
30 #define DEBUG_TYPE "x86-isel"
31 
32 using namespace llvm;
33 
34 STATISTIC(NumTailCalls, "Number of tail calls");
35 
36 /// Call this when the user attempts to do something unsupported, like
37 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
38 /// report_fatal_error, so calling code should attempt to recover without
39 /// crashing.
40 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
41                              const char *Msg) {
42   MachineFunction &MF = DAG.getMachineFunction();
43   DAG.getContext()->diagnose(
44       DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
45 }
46 
47 /// Returns true if a CC can dynamically exclude a register from the list of
48 /// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
49 /// the return registers.
50 static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
51   switch (CC) {
52   default:
53     return false;
54   case CallingConv::X86_RegCall:
55   case CallingConv::PreserveMost:
56   case CallingConv::PreserveAll:
57     return true;
58   }
59 }
60 
61 /// Returns true if a CC can dynamically exclude a register from the list of
62 /// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
63 /// the parameters.
64 static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
65   return CC == CallingConv::X86_RegCall;
66 }
67 
68 static std::pair<MVT, unsigned>
69 handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
70                                  const X86Subtarget &Subtarget) {
71   // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
72   // convention is one that uses k registers.
73   if (NumElts == 2)
74     return {MVT::v2i64, 1};
75   if (NumElts == 4)
76     return {MVT::v4i32, 1};
77   if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
78       CC != CallingConv::Intel_OCL_BI)
79     return {MVT::v8i16, 1};
80   if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
81       CC != CallingConv::Intel_OCL_BI)
82     return {MVT::v16i8, 1};
83   // v32i1 passes in ymm unless we have BWI and the calling convention is
84   // regcall.
85   if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
86     return {MVT::v32i8, 1};
87   // Split v64i1 vectors if we don't have v64i8 available.
88   if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
89     if (Subtarget.useAVX512Regs())
90       return {MVT::v64i8, 1};
91     return {MVT::v32i8, 2};
92   }
93 
94   // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
95   if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
96       NumElts > 64)
97     return {MVT::i8, NumElts};
98 
99   return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
100 }
101 
102 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
103                                                      CallingConv::ID CC,
104                                                      EVT VT) const {
105   if (VT.isVector()) {
106     if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
107       unsigned NumElts = VT.getVectorNumElements();
108 
109       MVT RegisterVT;
110       unsigned NumRegisters;
111       std::tie(RegisterVT, NumRegisters) =
112           handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
113       if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
114         return RegisterVT;
115     }
116 
117     if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
118       return MVT::v8f16;
119   }
120 
121   // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
122   if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
123       !Subtarget.hasX87())
124     return MVT::i32;
125 
126   if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
127     return getRegisterTypeForCallingConv(Context, CC,
128                                          VT.changeVectorElementType(MVT::f16));
129 
130   if (VT == MVT::bf16)
131     return MVT::f16;
132 
133   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
134 }
135 
136 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
137                                                           CallingConv::ID CC,
138                                                           EVT VT) const {
139   if (VT.isVector()) {
140     if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
141       unsigned NumElts = VT.getVectorNumElements();
142 
143       MVT RegisterVT;
144       unsigned NumRegisters;
145       std::tie(RegisterVT, NumRegisters) =
146           handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
147       if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
148         return NumRegisters;
149     }
150 
151     if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
152       return 1;
153   }
154 
155   // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
156   // x87 is disabled.
157   if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
158     if (VT == MVT::f64)
159       return 2;
160     if (VT == MVT::f80)
161       return 3;
162   }
163 
164   if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
165     return getNumRegistersForCallingConv(Context, CC,
166                                          VT.changeVectorElementType(MVT::f16));
167 
168   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
169 }
170 
171 unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
172     LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
173     unsigned &NumIntermediates, MVT &RegisterVT) const {
174   // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
175   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
176       Subtarget.hasAVX512() &&
177       (!isPowerOf2_32(VT.getVectorNumElements()) ||
178        (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
179        VT.getVectorNumElements() > 64)) {
180     RegisterVT = MVT::i8;
181     IntermediateVT = MVT::i1;
182     NumIntermediates = VT.getVectorNumElements();
183     return NumIntermediates;
184   }
185 
186   // Split v64i1 vectors if we don't have v64i8 available.
187   if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
188       CC != CallingConv::X86_RegCall) {
189     RegisterVT = MVT::v32i8;
190     IntermediateVT = MVT::v32i1;
191     NumIntermediates = 2;
192     return 2;
193   }
194 
195   // Split vNbf16 vectors according to vNf16.
196   if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
197     VT = VT.changeVectorElementType(MVT::f16);
198 
199   return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
200                                               NumIntermediates, RegisterVT);
201 }
202 
203 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
204                                           LLVMContext& Context,
205                                           EVT VT) const {
206   if (!VT.isVector())
207     return MVT::i8;
208 
209   if (Subtarget.hasAVX512()) {
210     // Figure out what this type will be legalized to.
211     EVT LegalVT = VT;
212     while (getTypeAction(Context, LegalVT) != TypeLegal)
213       LegalVT = getTypeToTransformTo(Context, LegalVT);
214 
215     // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
216     if (LegalVT.getSimpleVT().is512BitVector())
217       return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
218 
219     if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
220       // If we legalized to less than a 512-bit vector, then we will use a vXi1
221       // compare for vXi32/vXi64 for sure. If we have BWI we will also support
222       // vXi16/vXi8.
223       MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
224       if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
225         return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
226     }
227   }
228 
229   return VT.changeVectorElementTypeToInteger();
230 }
231 
232 /// Helper for getByValTypeAlignment to determine
233 /// the desired ByVal argument alignment.
234 static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
235   if (MaxAlign == 16)
236     return;
237   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
238     if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
239       MaxAlign = Align(16);
240   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
241     Align EltAlign;
242     getMaxByValAlign(ATy->getElementType(), EltAlign);
243     if (EltAlign > MaxAlign)
244       MaxAlign = EltAlign;
245   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
246     for (auto *EltTy : STy->elements()) {
247       Align EltAlign;
248       getMaxByValAlign(EltTy, EltAlign);
249       if (EltAlign > MaxAlign)
250         MaxAlign = EltAlign;
251       if (MaxAlign == 16)
252         break;
253     }
254   }
255 }
256 
257 /// Return the desired alignment for ByVal aggregate
258 /// function arguments in the caller parameter area. For X86, aggregates
259 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
260 /// are at 4-byte boundaries.
261 uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
262                                                   const DataLayout &DL) const {
263   if (Subtarget.is64Bit()) {
264     // Max of 8 and alignment of type.
265     Align TyAlign = DL.getABITypeAlign(Ty);
266     if (TyAlign > 8)
267       return TyAlign.value();
268     return 8;
269   }
270 
271   Align Alignment(4);
272   if (Subtarget.hasSSE1())
273     getMaxByValAlign(Ty, Alignment);
274   return Alignment.value();
275 }
276 
277 /// It returns EVT::Other if the type should be determined using generic
278 /// target-independent logic.
279 /// For vector ops we check that the overall size isn't larger than our
280 /// preferred vector width.
281 EVT X86TargetLowering::getOptimalMemOpType(
282     const MemOp &Op, const AttributeList &FuncAttributes) const {
283   if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
284     if (Op.size() >= 16 &&
285         (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
286       // FIXME: Check if unaligned 64-byte accesses are slow.
287       if (Op.size() >= 64 && Subtarget.hasAVX512() && Subtarget.hasEVEX512() &&
288           (Subtarget.getPreferVectorWidth() >= 512)) {
289         return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
290       }
291       // FIXME: Check if unaligned 32-byte accesses are slow.
292       if (Op.size() >= 32 && Subtarget.hasAVX() &&
293           Subtarget.useLight256BitInstructions()) {
294         // Although this isn't a well-supported type for AVX1, we'll let
295         // legalization and shuffle lowering produce the optimal codegen. If we
296         // choose an optimal type with a vector element larger than a byte,
297         // getMemsetStores() may create an intermediate splat (using an integer
298         // multiply) before we splat as a vector.
299         return MVT::v32i8;
300       }
301       if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
302         return MVT::v16i8;
303       // TODO: Can SSE1 handle a byte vector?
304       // If we have SSE1 registers we should be able to use them.
305       if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
306           (Subtarget.getPreferVectorWidth() >= 128))
307         return MVT::v4f32;
308     } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
309                Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
310       // Do not use f64 to lower memcpy if source is string constant. It's
311       // better to use i32 to avoid the loads.
312       // Also, do not use f64 to lower memset unless this is a memset of zeros.
313       // The gymnastics of splatting a byte value into an XMM register and then
314       // only using 8-byte stores (because this is a CPU with slow unaligned
315       // 16-byte accesses) makes that a loser.
316       return MVT::f64;
317     }
318   }
319   // This is a compromise. If we reach here, unaligned accesses may be slow on
320   // this target. However, creating smaller, aligned accesses could be even
321   // slower and would certainly be a lot more code.
322   if (Subtarget.is64Bit() && Op.size() >= 8)
323     return MVT::i64;
324   return MVT::i32;
325 }
326 
327 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
328   if (VT == MVT::f32)
329     return Subtarget.hasSSE1();
330   if (VT == MVT::f64)
331     return Subtarget.hasSSE2();
332   return true;
333 }
334 
335 static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
336   return (8 * Alignment.value()) % SizeInBits == 0;
337 }
338 
339 bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
340   if (isBitAligned(Alignment, VT.getSizeInBits()))
341     return true;
342   switch (VT.getSizeInBits()) {
343   default:
344     // 8-byte and under are always assumed to be fast.
345     return true;
346   case 128:
347     return !Subtarget.isUnalignedMem16Slow();
348   case 256:
349     return !Subtarget.isUnalignedMem32Slow();
350     // TODO: What about AVX-512 (512-bit) accesses?
351   }
352 }
353 
354 bool X86TargetLowering::allowsMisalignedMemoryAccesses(
355     EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
356     unsigned *Fast) const {
357   if (Fast)
358     *Fast = isMemoryAccessFast(VT, Alignment);
359   // NonTemporal vector memory ops must be aligned.
360   if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
361     // NT loads can only be vector aligned, so if its less aligned than the
362     // minimum vector size (which we can split the vector down to), we might as
363     // well use a regular unaligned vector load.
364     // We don't have any NT loads pre-SSE41.
365     if (!!(Flags & MachineMemOperand::MOLoad))
366       return (Alignment < 16 || !Subtarget.hasSSE41());
367     return false;
368   }
369   // Misaligned accesses of any size are always allowed.
370   return true;
371 }
372 
373 bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
374                                            const DataLayout &DL, EVT VT,
375                                            unsigned AddrSpace, Align Alignment,
376                                            MachineMemOperand::Flags Flags,
377                                            unsigned *Fast) const {
378   if (Fast)
379     *Fast = isMemoryAccessFast(VT, Alignment);
380   if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
381     if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
382                                        /*Fast=*/nullptr))
383       return true;
384     // NonTemporal vector memory ops are special, and must be aligned.
385     if (!isBitAligned(Alignment, VT.getSizeInBits()))
386       return false;
387     switch (VT.getSizeInBits()) {
388     case 128:
389       if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
390         return true;
391       if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
392         return true;
393       return false;
394     case 256:
395       if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
396         return true;
397       if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
398         return true;
399       return false;
400     case 512:
401       if (Subtarget.hasAVX512() && Subtarget.hasEVEX512())
402         return true;
403       return false;
404     default:
405       return false; // Don't have NonTemporal vector memory ops of this size.
406     }
407   }
408   return true;
409 }
410 
411 /// Return the entry encoding for a jump table in the
412 /// current function.  The returned value is a member of the
413 /// MachineJumpTableInfo::JTEntryKind enum.
414 unsigned X86TargetLowering::getJumpTableEncoding() const {
415   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
416   // symbol.
417   if (isPositionIndependent() && Subtarget.isPICStyleGOT())
418     return MachineJumpTableInfo::EK_Custom32;
419   if (isPositionIndependent() &&
420       getTargetMachine().getCodeModel() == CodeModel::Large)
421     return MachineJumpTableInfo::EK_LabelDifference64;
422 
423   // Otherwise, use the normal jump table encoding heuristics.
424   return TargetLowering::getJumpTableEncoding();
425 }
426 
427 bool X86TargetLowering::useSoftFloat() const {
428   return Subtarget.useSoftFloat();
429 }
430 
431 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
432                                               ArgListTy &Args) const {
433 
434   // Only relabel X86-32 for C / Stdcall CCs.
435   if (Subtarget.is64Bit())
436     return;
437   if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
438     return;
439   unsigned ParamRegs = 0;
440   if (auto *M = MF->getFunction().getParent())
441     ParamRegs = M->getNumberRegisterParameters();
442 
443   // Mark the first N int arguments as having reg
444   for (auto &Arg : Args) {
445     Type *T = Arg.Ty;
446     if (T->isIntOrPtrTy())
447       if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
448         unsigned numRegs = 1;
449         if (MF->getDataLayout().getTypeAllocSize(T) > 4)
450           numRegs = 2;
451         if (ParamRegs < numRegs)
452           return;
453         ParamRegs -= numRegs;
454         Arg.IsInReg = true;
455       }
456   }
457 }
458 
459 const MCExpr *
460 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
461                                              const MachineBasicBlock *MBB,
462                                              unsigned uid,MCContext &Ctx) const{
463   assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
464   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
465   // entries.
466   return MCSymbolRefExpr::create(MBB->getSymbol(),
467                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
468 }
469 
470 /// Returns relocation base for the given PIC jumptable.
471 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
472                                                     SelectionDAG &DAG) const {
473   if (!Subtarget.is64Bit())
474     // This doesn't have SDLoc associated with it, but is not really the
475     // same as a Register.
476     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
477                        getPointerTy(DAG.getDataLayout()));
478   return Table;
479 }
480 
481 /// This returns the relocation base for the given PIC jumptable,
482 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
483 const MCExpr *X86TargetLowering::
484 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
485                              MCContext &Ctx) const {
486   // X86-64 uses RIP relative addressing based on the jump table label.
487   if (Subtarget.isPICStyleRIPRel() ||
488       (Subtarget.is64Bit() &&
489        getTargetMachine().getCodeModel() == CodeModel::Large))
490     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
491 
492   // Otherwise, the reference is relative to the PIC base.
493   return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
494 }
495 
496 std::pair<const TargetRegisterClass *, uint8_t>
497 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
498                                            MVT VT) const {
499   const TargetRegisterClass *RRC = nullptr;
500   uint8_t Cost = 1;
501   switch (VT.SimpleTy) {
502   default:
503     return TargetLowering::findRepresentativeClass(TRI, VT);
504   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
505     RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
506     break;
507   case MVT::x86mmx:
508     RRC = &X86::VR64RegClass;
509     break;
510   case MVT::f32: case MVT::f64:
511   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
512   case MVT::v4f32: case MVT::v2f64:
513   case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
514   case MVT::v8f32: case MVT::v4f64:
515   case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
516   case MVT::v16f32: case MVT::v8f64:
517     RRC = &X86::VR128XRegClass;
518     break;
519   }
520   return std::make_pair(RRC, Cost);
521 }
522 
523 unsigned X86TargetLowering::getAddressSpace() const {
524   if (Subtarget.is64Bit())
525     return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
526   return 256;
527 }
528 
529 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
530   return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
531          (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
532 }
533 
534 static Constant* SegmentOffset(IRBuilderBase &IRB,
535                                int Offset, unsigned AddressSpace) {
536   return ConstantExpr::getIntToPtr(
537       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
538       IRB.getPtrTy(AddressSpace));
539 }
540 
541 Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
542   // glibc, bionic, and Fuchsia have a special slot for the stack guard in
543   // tcbhead_t; use it instead of the usual global variable (see
544   // sysdeps/{i386,x86_64}/nptl/tls.h)
545   if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
546     unsigned AddressSpace = getAddressSpace();
547 
548     // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
549     if (Subtarget.isTargetFuchsia())
550       return SegmentOffset(IRB, 0x10, AddressSpace);
551 
552     Module *M = IRB.GetInsertBlock()->getParent()->getParent();
553     // Specially, some users may customize the base reg and offset.
554     int Offset = M->getStackProtectorGuardOffset();
555     // If we don't set -stack-protector-guard-offset value:
556     // %fs:0x28, unless we're using a Kernel code model, in which case
557     // it's %gs:0x28.  gs:0x14 on i386.
558     if (Offset == INT_MAX)
559       Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
560 
561     StringRef GuardReg = M->getStackProtectorGuardReg();
562     if (GuardReg == "fs")
563       AddressSpace = X86AS::FS;
564     else if (GuardReg == "gs")
565       AddressSpace = X86AS::GS;
566 
567     // Use symbol guard if user specify.
568     StringRef GuardSymb = M->getStackProtectorGuardSymbol();
569     if (!GuardSymb.empty()) {
570       GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
571       if (!GV) {
572         Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
573                                        : Type::getInt32Ty(M->getContext());
574         GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
575                                 nullptr, GuardSymb, nullptr,
576                                 GlobalValue::NotThreadLocal, AddressSpace);
577         if (!Subtarget.isTargetDarwin())
578           GV->setDSOLocal(M->getDirectAccessExternalData());
579       }
580       return GV;
581     }
582 
583     return SegmentOffset(IRB, Offset, AddressSpace);
584   }
585   return TargetLowering::getIRStackGuard(IRB);
586 }
587 
588 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
589   // MSVC CRT provides functionalities for stack protection.
590   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
591       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
592     // MSVC CRT has a global variable holding security cookie.
593     M.getOrInsertGlobal("__security_cookie",
594                         PointerType::getUnqual(M.getContext()));
595 
596     // MSVC CRT has a function to validate security cookie.
597     FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
598         "__security_check_cookie", Type::getVoidTy(M.getContext()),
599         PointerType::getUnqual(M.getContext()));
600     if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
601       F->setCallingConv(CallingConv::X86_FastCall);
602       F->addParamAttr(0, Attribute::AttrKind::InReg);
603     }
604     return;
605   }
606 
607   StringRef GuardMode = M.getStackProtectorGuard();
608 
609   // glibc, bionic, and Fuchsia have a special slot for the stack guard.
610   if ((GuardMode == "tls" || GuardMode.empty()) &&
611       hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
612     return;
613   TargetLowering::insertSSPDeclarations(M);
614 }
615 
616 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
617   // MSVC CRT has a global variable holding security cookie.
618   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
619       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
620     return M.getGlobalVariable("__security_cookie");
621   }
622   return TargetLowering::getSDagStackGuard(M);
623 }
624 
625 Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
626   // MSVC CRT has a function to validate security cookie.
627   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
628       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
629     return M.getFunction("__security_check_cookie");
630   }
631   return TargetLowering::getSSPStackGuardCheck(M);
632 }
633 
634 Value *
635 X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
636   // Android provides a fixed TLS slot for the SafeStack pointer. See the
637   // definition of TLS_SLOT_SAFESTACK in
638   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
639   if (Subtarget.isTargetAndroid()) {
640     // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
641     // %gs:0x24 on i386
642     int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
643     return SegmentOffset(IRB, Offset, getAddressSpace());
644   }
645 
646   // Fuchsia is similar.
647   if (Subtarget.isTargetFuchsia()) {
648     // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
649     return SegmentOffset(IRB, 0x18, getAddressSpace());
650   }
651 
652   return TargetLowering::getSafeStackPointerLocation(IRB);
653 }
654 
655 //===----------------------------------------------------------------------===//
656 //               Return Value Calling Convention Implementation
657 //===----------------------------------------------------------------------===//
658 
659 bool X86TargetLowering::CanLowerReturn(
660     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
661     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
662   SmallVector<CCValAssign, 16> RVLocs;
663   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
664   return CCInfo.CheckReturn(Outs, RetCC_X86);
665 }
666 
667 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
668   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
669   return ScratchRegs;
670 }
671 
672 ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
673   // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit
674   // tests at the moment, which is not what we expected.
675   static const MCPhysReg RCRegs[] = {X86::MXCSR};
676   return RCRegs;
677 }
678 
679 /// Lowers masks values (v*i1) to the local register values
680 /// \returns DAG node after lowering to register type
681 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
682                                const SDLoc &DL, SelectionDAG &DAG) {
683   EVT ValVT = ValArg.getValueType();
684 
685   if (ValVT == MVT::v1i1)
686     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ValLoc, ValArg,
687                        DAG.getIntPtrConstant(0, DL));
688 
689   if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
690       (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
691     // Two stage lowering might be required
692     // bitcast:   v8i1 -> i8 / v16i1 -> i16
693     // anyextend: i8   -> i32 / i16   -> i32
694     EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
695     SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
696     if (ValLoc == MVT::i32)
697       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValToCopy);
698     return ValToCopy;
699   }
700 
701   if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
702       (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
703     // One stage lowering is required
704     // bitcast:   v32i1 -> i32 / v64i1 -> i64
705     return DAG.getBitcast(ValLoc, ValArg);
706   }
707 
708   return DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValArg);
709 }
710 
711 /// Breaks v64i1 value into two registers and adds the new node to the DAG
712 static void Passv64i1ArgInRegs(
713     const SDLoc &DL, SelectionDAG &DAG, SDValue &Arg,
714     SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
715     CCValAssign &NextVA, const X86Subtarget &Subtarget) {
716   assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
717   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
718   assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
719   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
720          "The value should reside in two registers");
721 
722   // Before splitting the value we cast it to i64
723   Arg = DAG.getBitcast(MVT::i64, Arg);
724 
725   // Splitting the value into two i32 types
726   SDValue Lo, Hi;
727   std::tie(Lo, Hi) = DAG.SplitScalar(Arg, DL, MVT::i32, MVT::i32);
728 
729   // Attach the two i32 types into corresponding registers
730   RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
731   RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
732 }
733 
734 SDValue
735 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
736                                bool isVarArg,
737                                const SmallVectorImpl<ISD::OutputArg> &Outs,
738                                const SmallVectorImpl<SDValue> &OutVals,
739                                const SDLoc &dl, SelectionDAG &DAG) const {
740   MachineFunction &MF = DAG.getMachineFunction();
741   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
742 
743   // In some cases we need to disable registers from the default CSR list.
744   // For example, when they are used as return registers (preserve_* and X86's
745   // regcall) or for argument passing (X86's regcall).
746   bool ShouldDisableCalleeSavedRegister =
747       shouldDisableRetRegFromCSR(CallConv) ||
748       MF.getFunction().hasFnAttribute("no_caller_saved_registers");
749 
750   if (CallConv == CallingConv::X86_INTR && !Outs.empty())
751     report_fatal_error("X86 interrupts may not return any value");
752 
753   SmallVector<CCValAssign, 16> RVLocs;
754   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
755   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
756 
757   SmallVector<std::pair<Register, SDValue>, 4> RetVals;
758   for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
759        ++I, ++OutsIndex) {
760     CCValAssign &VA = RVLocs[I];
761     assert(VA.isRegLoc() && "Can only return in registers!");
762 
763     // Add the register to the CalleeSaveDisableRegs list.
764     if (ShouldDisableCalleeSavedRegister)
765       MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
766 
767     SDValue ValToCopy = OutVals[OutsIndex];
768     EVT ValVT = ValToCopy.getValueType();
769 
770     // Promote values to the appropriate types.
771     if (VA.getLocInfo() == CCValAssign::SExt)
772       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
773     else if (VA.getLocInfo() == CCValAssign::ZExt)
774       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
775     else if (VA.getLocInfo() == CCValAssign::AExt) {
776       if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
777         ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
778       else
779         ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
780     }
781     else if (VA.getLocInfo() == CCValAssign::BCvt)
782       ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
783 
784     assert(VA.getLocInfo() != CCValAssign::FPExt &&
785            "Unexpected FP-extend for return value.");
786 
787     // Report an error if we have attempted to return a value via an XMM
788     // register and SSE was disabled.
789     if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
790       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
791       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
792     } else if (!Subtarget.hasSSE2() &&
793                X86::FR64XRegClass.contains(VA.getLocReg()) &&
794                ValVT == MVT::f64) {
795       // When returning a double via an XMM register, report an error if SSE2 is
796       // not enabled.
797       errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
798       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
799     }
800 
801     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
802     // the RET instruction and handled by the FP Stackifier.
803     if (VA.getLocReg() == X86::FP0 ||
804         VA.getLocReg() == X86::FP1) {
805       // If this is a copy from an xmm register to ST(0), use an FPExtend to
806       // change the value to the FP stack register class.
807       if (isScalarFPTypeInSSEReg(VA.getValVT()))
808         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
809       RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
810       // Don't emit a copytoreg.
811       continue;
812     }
813 
814     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
815     // which is returned in RAX / RDX.
816     if (Subtarget.is64Bit()) {
817       if (ValVT == MVT::x86mmx) {
818         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
819           ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
820           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
821                                   ValToCopy);
822           // If we don't have SSE2 available, convert to v4f32 so the generated
823           // register is legal.
824           if (!Subtarget.hasSSE2())
825             ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
826         }
827       }
828     }
829 
830     if (VA.needsCustom()) {
831       assert(VA.getValVT() == MVT::v64i1 &&
832              "Currently the only custom case is when we split v64i1 to 2 regs");
833 
834       Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
835                          Subtarget);
836 
837       // Add the second register to the CalleeSaveDisableRegs list.
838       if (ShouldDisableCalleeSavedRegister)
839         MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
840     } else {
841       RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
842     }
843   }
844 
845   SDValue Glue;
846   SmallVector<SDValue, 6> RetOps;
847   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
848   // Operand #1 = Bytes To Pop
849   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
850                    MVT::i32));
851 
852   // Copy the result values into the output registers.
853   for (auto &RetVal : RetVals) {
854     if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
855       RetOps.push_back(RetVal.second);
856       continue; // Don't emit a copytoreg.
857     }
858 
859     Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
860     Glue = Chain.getValue(1);
861     RetOps.push_back(
862         DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
863   }
864 
865   // Swift calling convention does not require we copy the sret argument
866   // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
867 
868   // All x86 ABIs require that for returning structs by value we copy
869   // the sret argument into %rax/%eax (depending on ABI) for the return.
870   // We saved the argument into a virtual register in the entry block,
871   // so now we copy the value out and into %rax/%eax.
872   //
873   // Checking Function.hasStructRetAttr() here is insufficient because the IR
874   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
875   // false, then an sret argument may be implicitly inserted in the SelDAG. In
876   // either case FuncInfo->setSRetReturnReg() will have been called.
877   if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
878     // When we have both sret and another return value, we should use the
879     // original Chain stored in RetOps[0], instead of the current Chain updated
880     // in the above loop. If we only have sret, RetOps[0] equals to Chain.
881 
882     // For the case of sret and another return value, we have
883     //   Chain_0 at the function entry
884     //   Chain_1 = getCopyToReg(Chain_0) in the above loop
885     // If we use Chain_1 in getCopyFromReg, we will have
886     //   Val = getCopyFromReg(Chain_1)
887     //   Chain_2 = getCopyToReg(Chain_1, Val) from below
888 
889     // getCopyToReg(Chain_0) will be glued together with
890     // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
891     // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
892     //   Data dependency from Unit B to Unit A due to usage of Val in
893     //     getCopyToReg(Chain_1, Val)
894     //   Chain dependency from Unit A to Unit B
895 
896     // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
897     SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
898                                      getPointerTy(MF.getDataLayout()));
899 
900     Register RetValReg
901         = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
902           X86::RAX : X86::EAX;
903     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
904     Glue = Chain.getValue(1);
905 
906     // RAX/EAX now acts like a return value.
907     RetOps.push_back(
908         DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
909 
910     // Add the returned register to the CalleeSaveDisableRegs list. Don't do
911     // this however for preserve_most/preserve_all to minimize the number of
912     // callee-saved registers for these CCs.
913     if (ShouldDisableCalleeSavedRegister &&
914         CallConv != CallingConv::PreserveAll &&
915         CallConv != CallingConv::PreserveMost)
916       MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
917   }
918 
919   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
920   const MCPhysReg *I =
921       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
922   if (I) {
923     for (; *I; ++I) {
924       if (X86::GR64RegClass.contains(*I))
925         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
926       else
927         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
928     }
929   }
930 
931   RetOps[0] = Chain;  // Update chain.
932 
933   // Add the glue if we have it.
934   if (Glue.getNode())
935     RetOps.push_back(Glue);
936 
937   X86ISD::NodeType opcode = X86ISD::RET_GLUE;
938   if (CallConv == CallingConv::X86_INTR)
939     opcode = X86ISD::IRET;
940   return DAG.getNode(opcode, dl, MVT::Other, RetOps);
941 }
942 
943 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
944   if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
945     return false;
946 
947   SDValue TCChain = Chain;
948   SDNode *Copy = *N->use_begin();
949   if (Copy->getOpcode() == ISD::CopyToReg) {
950     // If the copy has a glue operand, we conservatively assume it isn't safe to
951     // perform a tail call.
952     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
953       return false;
954     TCChain = Copy->getOperand(0);
955   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
956     return false;
957 
958   bool HasRet = false;
959   for (const SDNode *U : Copy->uses()) {
960     if (U->getOpcode() != X86ISD::RET_GLUE)
961       return false;
962     // If we are returning more than one value, we can definitely
963     // not make a tail call see PR19530
964     if (U->getNumOperands() > 4)
965       return false;
966     if (U->getNumOperands() == 4 &&
967         U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
968       return false;
969     HasRet = true;
970   }
971 
972   if (!HasRet)
973     return false;
974 
975   Chain = TCChain;
976   return true;
977 }
978 
979 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
980                                            ISD::NodeType ExtendKind) const {
981   MVT ReturnMVT = MVT::i32;
982 
983   bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
984   if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
985     // The ABI does not require i1, i8 or i16 to be extended.
986     //
987     // On Darwin, there is code in the wild relying on Clang's old behaviour of
988     // always extending i8/i16 return values, so keep doing that for now.
989     // (PR26665).
990     ReturnMVT = MVT::i8;
991   }
992 
993   EVT MinVT = getRegisterType(Context, ReturnMVT);
994   return VT.bitsLT(MinVT) ? MinVT : VT;
995 }
996 
997 /// Reads two 32 bit registers and creates a 64 bit mask value.
998 /// \param VA The current 32 bit value that need to be assigned.
999 /// \param NextVA The next 32 bit value that need to be assigned.
1000 /// \param Root The parent DAG node.
1001 /// \param [in,out] InGlue Represents SDvalue in the parent DAG node for
1002 ///                        glue purposes. In the case the DAG is already using
1003 ///                        physical register instead of virtual, we should glue
1004 ///                        our new SDValue to InGlue SDvalue.
1005 /// \return a new SDvalue of size 64bit.
1006 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
1007                                 SDValue &Root, SelectionDAG &DAG,
1008                                 const SDLoc &DL, const X86Subtarget &Subtarget,
1009                                 SDValue *InGlue = nullptr) {
1010   assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
1011   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
1012   assert(VA.getValVT() == MVT::v64i1 &&
1013          "Expecting first location of 64 bit width type");
1014   assert(NextVA.getValVT() == VA.getValVT() &&
1015          "The locations should have the same type");
1016   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
1017          "The values should reside in two registers");
1018 
1019   SDValue Lo, Hi;
1020   SDValue ArgValueLo, ArgValueHi;
1021 
1022   MachineFunction &MF = DAG.getMachineFunction();
1023   const TargetRegisterClass *RC = &X86::GR32RegClass;
1024 
1025   // Read a 32 bit value from the registers.
1026   if (nullptr == InGlue) {
1027     // When no physical register is present,
1028     // create an intermediate virtual register.
1029     Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
1030     ArgValueLo = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
1031     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
1032     ArgValueHi = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
1033   } else {
1034     // When a physical register is available read the value from it and glue
1035     // the reads together.
1036     ArgValueLo =
1037       DAG.getCopyFromReg(Root, DL, VA.getLocReg(), MVT::i32, *InGlue);
1038     *InGlue = ArgValueLo.getValue(2);
1039     ArgValueHi =
1040       DAG.getCopyFromReg(Root, DL, NextVA.getLocReg(), MVT::i32, *InGlue);
1041     *InGlue = ArgValueHi.getValue(2);
1042   }
1043 
1044   // Convert the i32 type into v32i1 type.
1045   Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
1046 
1047   // Convert the i32 type into v32i1 type.
1048   Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
1049 
1050   // Concatenate the two values together.
1051   return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v64i1, Lo, Hi);
1052 }
1053 
1054 /// The function will lower a register of various sizes (8/16/32/64)
1055 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
1056 /// \returns a DAG node contains the operand after lowering to mask type.
1057 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
1058                                const EVT &ValLoc, const SDLoc &DL,
1059                                SelectionDAG &DAG) {
1060   SDValue ValReturned = ValArg;
1061 
1062   if (ValVT == MVT::v1i1)
1063     return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, ValReturned);
1064 
1065   if (ValVT == MVT::v64i1) {
1066     // In 32 bit machine, this case is handled by getv64i1Argument
1067     assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
1068     // In 64 bit machine, There is no need to truncate the value only bitcast
1069   } else {
1070     MVT MaskLenVT;
1071     switch (ValVT.getSimpleVT().SimpleTy) {
1072     case MVT::v8i1:
1073       MaskLenVT = MVT::i8;
1074       break;
1075     case MVT::v16i1:
1076       MaskLenVT = MVT::i16;
1077       break;
1078     case MVT::v32i1:
1079       MaskLenVT = MVT::i32;
1080       break;
1081     default:
1082       llvm_unreachable("Expecting a vector of i1 types");
1083     }
1084 
1085     ValReturned = DAG.getNode(ISD::TRUNCATE, DL, MaskLenVT, ValReturned);
1086   }
1087   return DAG.getBitcast(ValVT, ValReturned);
1088 }
1089 
1090 /// Lower the result values of a call into the
1091 /// appropriate copies out of appropriate physical registers.
1092 ///
1093 SDValue X86TargetLowering::LowerCallResult(
1094     SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
1095     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1096     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
1097     uint32_t *RegMask) const {
1098 
1099   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
1100   // Assign locations to each value returned by this call.
1101   SmallVector<CCValAssign, 16> RVLocs;
1102   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1103                  *DAG.getContext());
1104   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1105 
1106   // Copy all of the result registers out of their specified physreg.
1107   for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
1108        ++I, ++InsIndex) {
1109     CCValAssign &VA = RVLocs[I];
1110     EVT CopyVT = VA.getLocVT();
1111 
1112     // In some calling conventions we need to remove the used registers
1113     // from the register mask.
1114     if (RegMask) {
1115       for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg()))
1116         RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
1117     }
1118 
1119     // Report an error if there was an attempt to return FP values via XMM
1120     // registers.
1121     if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
1122       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
1123       if (VA.getLocReg() == X86::XMM1)
1124         VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
1125       else
1126         VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
1127     } else if (!Subtarget.hasSSE2() &&
1128                X86::FR64XRegClass.contains(VA.getLocReg()) &&
1129                CopyVT == MVT::f64) {
1130       errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
1131       if (VA.getLocReg() == X86::XMM1)
1132         VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
1133       else
1134         VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
1135     }
1136 
1137     // If we prefer to use the value in xmm registers, copy it out as f80 and
1138     // use a truncate to move it from fp stack reg to xmm reg.
1139     bool RoundAfterCopy = false;
1140     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
1141         isScalarFPTypeInSSEReg(VA.getValVT())) {
1142       if (!Subtarget.hasX87())
1143         report_fatal_error("X87 register return with X87 disabled");
1144       CopyVT = MVT::f80;
1145       RoundAfterCopy = (CopyVT != VA.getLocVT());
1146     }
1147 
1148     SDValue Val;
1149     if (VA.needsCustom()) {
1150       assert(VA.getValVT() == MVT::v64i1 &&
1151              "Currently the only custom case is when we split v64i1 to 2 regs");
1152       Val =
1153           getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
1154     } else {
1155       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
1156                   .getValue(1);
1157       Val = Chain.getValue(0);
1158       InGlue = Chain.getValue(2);
1159     }
1160 
1161     if (RoundAfterCopy)
1162       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1163                         // This truncation won't change the value.
1164                         DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
1165 
1166     if (VA.isExtInLoc()) {
1167       if (VA.getValVT().isVector() &&
1168           VA.getValVT().getScalarType() == MVT::i1 &&
1169           ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
1170            (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
1171         // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
1172         Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
1173       } else
1174         Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
1175     }
1176 
1177     if (VA.getLocInfo() == CCValAssign::BCvt)
1178       Val = DAG.getBitcast(VA.getValVT(), Val);
1179 
1180     InVals.push_back(Val);
1181   }
1182 
1183   return Chain;
1184 }
1185 
1186 //===----------------------------------------------------------------------===//
1187 //                C & StdCall & Fast Calling Convention implementation
1188 //===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention in only one respect:
//  the callee, not the caller, cleans up the stack. Symbols are also
//  decorated in a convention-specific way. It doesn't support any vector
//  arguments. For info on the fast calling convention, see the Fast Calling
//  Convention (tail call) implementation LowerX86_32FastCCCallTo.
1195 
1196 /// Determines whether Args, either a set of outgoing arguments to a call, or a
1197 /// set of incoming args of a call, contains an sret pointer that the callee
1198 /// pops
1199 template <typename T>
1200 static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
1201                              const X86Subtarget &Subtarget) {
1202   // Not C++20 (yet), so no concepts available.
1203   static_assert(std::is_same_v<T, ISD::OutputArg> ||
1204                     std::is_same_v<T, ISD::InputArg>,
1205                 "requires ISD::OutputArg or ISD::InputArg");
1206 
1207   // Only 32-bit pops the sret.  It's a 64-bit world these days, so early-out
1208   // for most compilations.
1209   if (!Subtarget.is32Bit())
1210     return false;
1211 
1212   if (Args.empty())
1213     return false;
1214 
1215   // Most calls do not have an sret argument, check the arg next.
1216   const ISD::ArgFlagsTy &Flags = Args[0].Flags;
1217   if (!Flags.isSRet() || Flags.isInReg())
1218     return false;
1219 
1220   // The MSVCabi does not pop the sret.
1221   if (Subtarget.getTargetTriple().isOSMSVCRT())
1222     return false;
1223 
1224   // MCUs don't pop the sret
1225   if (Subtarget.isTargetMCU())
1226     return false;
1227 
1228   // Callee pops argument
1229   return true;
1230 }
1231 
1232 /// Make a copy of an aggregate at address specified by "Src" to address
1233 /// "Dst" with size and alignment information specified by the specific
1234 /// parameter attribute. The copy will be passed as a byval function parameter.
1235 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
1236                                          SDValue Chain, ISD::ArgFlagsTy Flags,
1237                                          SelectionDAG &DAG, const SDLoc &dl) {
1238   SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
1239 
1240   return DAG.getMemcpy(
1241       Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
1242       /*isVolatile*/ false, /*AlwaysInline=*/true,
1243       /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
1244 }
1245 
1246 /// Return true if the calling convention is one that we can guarantee TCO for.
1247 static bool canGuaranteeTCO(CallingConv::ID CC) {
1248   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
1249           CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
1250           CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
1251 }
1252 
1253 /// Return true if we might ever do TCO for calls with this calling convention.
1254 static bool mayTailCallThisCC(CallingConv::ID CC) {
1255   switch (CC) {
1256   // C calling conventions:
1257   case CallingConv::C:
1258   case CallingConv::Win64:
1259   case CallingConv::X86_64_SysV:
1260   // Callee pop conventions:
1261   case CallingConv::X86_ThisCall:
1262   case CallingConv::X86_StdCall:
1263   case CallingConv::X86_VectorCall:
1264   case CallingConv::X86_FastCall:
1265   // Swift:
1266   case CallingConv::Swift:
1267     return true;
1268   default:
1269     return canGuaranteeTCO(CC);
1270   }
1271 }
1272 
1273 /// Return true if the function is being made into a tailcall target by
1274 /// changing its ABI.
1275 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
1276   return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
1277          CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
1278 }
1279 
1280 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
1281   if (!CI->isTailCall())
1282     return false;
1283 
1284   CallingConv::ID CalleeCC = CI->getCallingConv();
1285   if (!mayTailCallThisCC(CalleeCC))
1286     return false;
1287 
1288   return true;
1289 }
1290 
1291 SDValue
1292 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
1293                                     const SmallVectorImpl<ISD::InputArg> &Ins,
1294                                     const SDLoc &dl, SelectionDAG &DAG,
1295                                     const CCValAssign &VA,
1296                                     MachineFrameInfo &MFI, unsigned i) const {
1297   // Create the nodes corresponding to a load from this parameter slot.
1298   ISD::ArgFlagsTy Flags = Ins[i].Flags;
1299   bool AlwaysUseMutable = shouldGuaranteeTCO(
1300       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
1301   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1302   EVT ValVT;
1303   MVT PtrVT = getPointerTy(DAG.getDataLayout());
1304 
1305   // If value is passed by pointer we have address passed instead of the value
1306   // itself. No need to extend if the mask value and location share the same
1307   // absolute size.
1308   bool ExtendedInMem =
1309       VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
1310       VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
1311 
1312   if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
1313     ValVT = VA.getLocVT();
1314   else
1315     ValVT = VA.getValVT();
1316 
1317   // FIXME: For now, all byval parameter objects are marked mutable. This can be
1318   // changed with more analysis.
1319   // In case of tail call optimization mark all arguments mutable. Since they
1320   // could be overwritten by lowering of arguments in case of a tail call.
1321   if (Flags.isByVal()) {
1322     unsigned Bytes = Flags.getByValSize();
1323     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
1324 
1325     // FIXME: For now, all byval parameter objects are marked as aliasing. This
1326     // can be improved with deeper analysis.
1327     int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
1328                                    /*isAliased=*/true);
1329     return DAG.getFrameIndex(FI, PtrVT);
1330   }
1331 
1332   EVT ArgVT = Ins[i].ArgVT;
1333 
1334   // If this is a vector that has been split into multiple parts, don't elide
1335   // the copy. The layout on the stack may not match the packed in-memory
1336   // layout.
1337   bool ScalarizedVector = ArgVT.isVector() && !VA.getLocVT().isVector();
1338 
1339   // This is an argument in memory. We might be able to perform copy elision.
1340   // If the argument is passed directly in memory without any extension, then we
1341   // can perform copy elision. Large vector types, for example, may be passed
1342   // indirectly by pointer.
1343   if (Flags.isCopyElisionCandidate() &&
1344       VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
1345       !ScalarizedVector) {
1346     SDValue PartAddr;
1347     if (Ins[i].PartOffset == 0) {
1348       // If this is a one-part value or the first part of a multi-part value,
1349       // create a stack object for the entire argument value type and return a
1350       // load from our portion of it. This assumes that if the first part of an
1351       // argument is in memory, the rest will also be in memory.
1352       int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
1353                                      /*IsImmutable=*/false);
1354       PartAddr = DAG.getFrameIndex(FI, PtrVT);
1355       return DAG.getLoad(
1356           ValVT, dl, Chain, PartAddr,
1357           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
1358     }
1359 
1360     // This is not the first piece of an argument in memory. See if there is
1361     // already a fixed stack object including this offset. If so, assume it
1362     // was created by the PartOffset == 0 branch above and create a load from
1363     // the appropriate offset into it.
1364     int64_t PartBegin = VA.getLocMemOffset();
1365     int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
1366     int FI = MFI.getObjectIndexBegin();
1367     for (; MFI.isFixedObjectIndex(FI); ++FI) {
1368       int64_t ObjBegin = MFI.getObjectOffset(FI);
1369       int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
1370       if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
1371         break;
1372     }
1373     if (MFI.isFixedObjectIndex(FI)) {
1374       SDValue Addr =
1375           DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
1376                       DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
1377       return DAG.getLoad(ValVT, dl, Chain, Addr,
1378                          MachinePointerInfo::getFixedStack(
1379                              DAG.getMachineFunction(), FI, Ins[i].PartOffset));
1380     }
1381   }
1382 
1383   int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
1384                                  VA.getLocMemOffset(), isImmutable);
1385 
1386   // Set SExt or ZExt flag.
1387   if (VA.getLocInfo() == CCValAssign::ZExt) {
1388     MFI.setObjectZExt(FI, true);
1389   } else if (VA.getLocInfo() == CCValAssign::SExt) {
1390     MFI.setObjectSExt(FI, true);
1391   }
1392 
1393   MaybeAlign Alignment;
1394   if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
1395       ValVT != MVT::f80)
1396     Alignment = MaybeAlign(4);
1397   SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
1398   SDValue Val = DAG.getLoad(
1399       ValVT, dl, Chain, FIN,
1400       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1401       Alignment);
1402   return ExtendedInMem
1403              ? (VA.getValVT().isVector()
1404                     ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
1405                     : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
1406              : Val;
1407 }
1408 
1409 // FIXME: Get this from tablegen.
1410 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
1411                                                 const X86Subtarget &Subtarget) {
1412   assert(Subtarget.is64Bit());
1413 
1414   if (Subtarget.isCallingConvWin64(CallConv)) {
1415     static const MCPhysReg GPR64ArgRegsWin64[] = {
1416       X86::RCX, X86::RDX, X86::R8,  X86::R9
1417     };
1418     return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
1419   }
1420 
1421   static const MCPhysReg GPR64ArgRegs64Bit[] = {
1422     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1423   };
1424   return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
1425 }
1426 
1427 // FIXME: Get this from tablegen.
1428 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
1429                                                 CallingConv::ID CallConv,
1430                                                 const X86Subtarget &Subtarget) {
1431   assert(Subtarget.is64Bit());
1432   if (Subtarget.isCallingConvWin64(CallConv)) {
1433     // The XMM registers which might contain var arg parameters are shadowed
1434     // in their paired GPR.  So we only need to save the GPR to their home
1435     // slots.
1436     // TODO: __vectorcall will change this.
1437     return std::nullopt;
1438   }
1439 
1440   bool isSoftFloat = Subtarget.useSoftFloat();
1441   if (isSoftFloat || !Subtarget.hasSSE1())
1442     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
1443     // registers.
1444     return std::nullopt;
1445 
1446   static const MCPhysReg XMMArgRegs64Bit[] = {
1447     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1448     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1449   };
1450   return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
1451 }
1452 
1453 #ifndef NDEBUG
1454 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
1455   return llvm::is_sorted(
1456       ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
1457         return A.getValNo() < B.getValNo();
1458       });
1459 }
1460 #endif
1461 
1462 namespace {
1463 /// This is a helper class for lowering variable arguments parameters.
1464 class VarArgsLoweringHelper {
1465 public:
1466   VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
1467                         SelectionDAG &DAG, const X86Subtarget &Subtarget,
1468                         CallingConv::ID CallConv, CCState &CCInfo)
1469       : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
1470         TheMachineFunction(DAG.getMachineFunction()),
1471         TheFunction(TheMachineFunction.getFunction()),
1472         FrameInfo(TheMachineFunction.getFrameInfo()),
1473         FrameLowering(*Subtarget.getFrameLowering()),
1474         TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
1475         CCInfo(CCInfo) {}
1476 
1477   // Lower variable arguments parameters.
1478   void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
1479 
1480 private:
1481   void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
1482 
1483   void forwardMustTailParameters(SDValue &Chain);
1484 
1485   bool is64Bit() const { return Subtarget.is64Bit(); }
1486   bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
1487 
1488   X86MachineFunctionInfo *FuncInfo;
1489   const SDLoc &DL;
1490   SelectionDAG &DAG;
1491   const X86Subtarget &Subtarget;
1492   MachineFunction &TheMachineFunction;
1493   const Function &TheFunction;
1494   MachineFrameInfo &FrameInfo;
1495   const TargetFrameLowering &FrameLowering;
1496   const TargetLowering &TargLowering;
1497   CallingConv::ID CallConv;
1498   CCState &CCInfo;
1499 };
1500 } // namespace
1501 
1502 void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
1503     SDValue &Chain, unsigned StackSize) {
1504   // If the function takes variable number of arguments, make a frame index for
1505   // the start of the first vararg value... for expansion of llvm.va_start. We
1506   // can skip this if there are no va_start calls.
1507   if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
1508                     CallConv != CallingConv::X86_ThisCall)) {
1509     FuncInfo->setVarArgsFrameIndex(
1510         FrameInfo.CreateFixedObject(1, StackSize, true));
1511   }
1512 
1513   // 64-bit calling conventions support varargs and register parameters, so we
1514   // have to do extra work to spill them in the prologue.
1515   if (is64Bit()) {
1516     // Find the first unallocated argument registers.
1517     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
1518     ArrayRef<MCPhysReg> ArgXMMs =
1519         get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
1520     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
1521     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
1522 
1523     assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
1524            "SSE register cannot be used when SSE is disabled!");
1525 
1526     if (isWin64()) {
1527       // Get to the caller-allocated home save location.  Add 8 to account
1528       // for the return address.
1529       int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
1530       FuncInfo->setRegSaveFrameIndex(
1531           FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
1532       // Fixup to set vararg frame on shadow area (4 x i64).
1533       if (NumIntRegs < 4)
1534         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
1535     } else {
1536       // For X86-64, if there are vararg parameters that are passed via
1537       // registers, then we must store them to their spots on the stack so
1538       // they may be loaded by dereferencing the result of va_next.
1539       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
1540       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
1541       FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
1542           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
1543     }
1544 
1545     SmallVector<SDValue, 6>
1546         LiveGPRs; // list of SDValue for GPR registers keeping live input value
1547     SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
1548                                          // keeping live input value
1549     SDValue ALVal; // if applicable keeps SDValue for %al register
1550 
1551     // Gather all the live in physical registers.
1552     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
1553       Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
1554       LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
1555     }
1556     const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
1557     if (!AvailableXmms.empty()) {
1558       Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
1559       ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
1560       for (MCPhysReg Reg : AvailableXmms) {
1561         // FastRegisterAllocator spills virtual registers at basic
1562         // block boundary. That leads to usages of xmm registers
1563         // outside of check for %al. Pass physical registers to
1564         // VASTART_SAVE_XMM_REGS to avoid unneccessary spilling.
1565         TheMachineFunction.getRegInfo().addLiveIn(Reg);
1566         LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
1567       }
1568     }
1569 
1570     // Store the integer parameter registers.
1571     SmallVector<SDValue, 8> MemOps;
1572     SDValue RSFIN =
1573         DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
1574                           TargLowering.getPointerTy(DAG.getDataLayout()));
1575     unsigned Offset = FuncInfo->getVarArgsGPOffset();
1576     for (SDValue Val : LiveGPRs) {
1577       SDValue FIN = DAG.getNode(ISD::ADD, DL,
1578                                 TargLowering.getPointerTy(DAG.getDataLayout()),
1579                                 RSFIN, DAG.getIntPtrConstant(Offset, DL));
1580       SDValue Store =
1581           DAG.getStore(Val.getValue(1), DL, Val, FIN,
1582                        MachinePointerInfo::getFixedStack(
1583                            DAG.getMachineFunction(),
1584                            FuncInfo->getRegSaveFrameIndex(), Offset));
1585       MemOps.push_back(Store);
1586       Offset += 8;
1587     }
1588 
1589     // Now store the XMM (fp + vector) parameter registers.
1590     if (!LiveXMMRegs.empty()) {
1591       SmallVector<SDValue, 12> SaveXMMOps;
1592       SaveXMMOps.push_back(Chain);
1593       SaveXMMOps.push_back(ALVal);
1594       SaveXMMOps.push_back(RSFIN);
1595       SaveXMMOps.push_back(
1596           DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
1597       llvm::append_range(SaveXMMOps, LiveXMMRegs);
1598       MachineMemOperand *StoreMMO =
1599           DAG.getMachineFunction().getMachineMemOperand(
1600               MachinePointerInfo::getFixedStack(
1601                   DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
1602                   Offset),
1603               MachineMemOperand::MOStore, 128, Align(16));
1604       MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
1605                                                DL, DAG.getVTList(MVT::Other),
1606                                                SaveXMMOps, MVT::i8, StoreMMO));
1607     }
1608 
1609     if (!MemOps.empty())
1610       Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
1611   }
1612 }
1613 
1614 void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
1615   // Find the largest legal vector type.
1616   MVT VecVT = MVT::Other;
1617   // FIXME: Only some x86_32 calling conventions support AVX512.
1618   if (Subtarget.useAVX512Regs() &&
1619       (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
1620                      CallConv == CallingConv::Intel_OCL_BI)))
1621     VecVT = MVT::v16f32;
1622   else if (Subtarget.hasAVX())
1623     VecVT = MVT::v8f32;
1624   else if (Subtarget.hasSSE2())
1625     VecVT = MVT::v4f32;
1626 
1627   // We forward some GPRs and some vector types.
1628   SmallVector<MVT, 2> RegParmTypes;
1629   MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
1630   RegParmTypes.push_back(IntVT);
1631   if (VecVT != MVT::Other)
1632     RegParmTypes.push_back(VecVT);
1633 
1634   // Compute the set of forwarded registers. The rest are scratch.
1635   SmallVectorImpl<ForwardedRegister> &Forwards =
1636       FuncInfo->getForwardedMustTailRegParms();
1637   CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
1638 
1639   // Forward AL for SysV x86_64 targets, since it is used for varargs.
1640   if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
1641     Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
1642     Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
1643   }
1644 
1645   // Copy all forwards from physical to virtual registers.
1646   for (ForwardedRegister &FR : Forwards) {
1647     // FIXME: Can we use a less constrained schedule?
1648     SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
1649     FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
1650         TargLowering.getRegClassFor(FR.VT));
1651     Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
1652   }
1653 }
1654 
1655 void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
1656                                                    unsigned StackSize) {
1657   // Set FrameIndex to the 0xAAAAAAA value to mark unset state.
1658   // If necessary, it would be set into the correct value later.
1659   FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
1660   FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1661 
1662   if (FrameInfo.hasVAStart())
1663     createVarArgAreaAndStoreRegisters(Chain, StackSize);
1664 
1665   if (FrameInfo.hasMustTailInVarArgFunc())
1666     forwardMustTailParameters(Chain);
1667 }
1668 
1669 SDValue X86TargetLowering::LowerFormalArguments(
1670     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
1671     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1672     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1673   MachineFunction &MF = DAG.getMachineFunction();
1674   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1675 
1676   const Function &F = MF.getFunction();
1677   if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
1678       F.getName() == "main")
1679     FuncInfo->setForceFramePointer(true);
1680 
1681   MachineFrameInfo &MFI = MF.getFrameInfo();
1682   bool Is64Bit = Subtarget.is64Bit();
1683   bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
1684 
1685   assert(
1686       !(IsVarArg && canGuaranteeTCO(CallConv)) &&
1687       "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
1688 
1689   // Assign locations to all of the incoming arguments.
1690   SmallVector<CCValAssign, 16> ArgLocs;
1691   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
1692 
1693   // Allocate shadow area for Win64.
1694   if (IsWin64)
1695     CCInfo.AllocateStack(32, Align(8));
1696 
1697   CCInfo.AnalyzeArguments(Ins, CC_X86);
1698 
1699   // In vectorcall calling convention a second pass is required for the HVA
1700   // types.
1701   if (CallingConv::X86_VectorCall == CallConv) {
1702     CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
1703   }
1704 
1705   // The next loop assumes that the locations are in the same order of the
1706   // input arguments.
1707   assert(isSortedByValueNo(ArgLocs) &&
1708          "Argument Location list must be sorted before lowering");
1709 
1710   SDValue ArgValue;
1711   for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
1712        ++I, ++InsIndex) {
1713     assert(InsIndex < Ins.size() && "Invalid Ins index");
1714     CCValAssign &VA = ArgLocs[I];
1715 
1716     if (VA.isRegLoc()) {
1717       EVT RegVT = VA.getLocVT();
1718       if (VA.needsCustom()) {
1719         assert(
1720             VA.getValVT() == MVT::v64i1 &&
1721             "Currently the only custom case is when we split v64i1 to 2 regs");
1722 
1723         // v64i1 values, in regcall calling convention, that are
1724         // compiled to 32 bit arch, are split up into two registers.
1725         ArgValue =
1726             getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
1727       } else {
1728         const TargetRegisterClass *RC;
1729         if (RegVT == MVT::i8)
1730           RC = &X86::GR8RegClass;
1731         else if (RegVT == MVT::i16)
1732           RC = &X86::GR16RegClass;
1733         else if (RegVT == MVT::i32)
1734           RC = &X86::GR32RegClass;
1735         else if (Is64Bit && RegVT == MVT::i64)
1736           RC = &X86::GR64RegClass;
1737         else if (RegVT == MVT::f16)
1738           RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
1739         else if (RegVT == MVT::f32)
1740           RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
1741         else if (RegVT == MVT::f64)
1742           RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
1743         else if (RegVT == MVT::f80)
1744           RC = &X86::RFP80RegClass;
1745         else if (RegVT == MVT::f128)
1746           RC = &X86::VR128RegClass;
1747         else if (RegVT.is512BitVector())
1748           RC = &X86::VR512RegClass;
1749         else if (RegVT.is256BitVector())
1750           RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
1751         else if (RegVT.is128BitVector())
1752           RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
1753         else if (RegVT == MVT::x86mmx)
1754           RC = &X86::VR64RegClass;
1755         else if (RegVT == MVT::v1i1)
1756           RC = &X86::VK1RegClass;
1757         else if (RegVT == MVT::v8i1)
1758           RC = &X86::VK8RegClass;
1759         else if (RegVT == MVT::v16i1)
1760           RC = &X86::VK16RegClass;
1761         else if (RegVT == MVT::v32i1)
1762           RC = &X86::VK32RegClass;
1763         else if (RegVT == MVT::v64i1)
1764           RC = &X86::VK64RegClass;
1765         else
1766           llvm_unreachable("Unknown argument type!");
1767 
1768         Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
1769         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1770       }
1771 
1772       // If this is an 8 or 16-bit value, it is really passed promoted to 32
1773       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1774       // right size.
1775       if (VA.getLocInfo() == CCValAssign::SExt)
1776         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1777                                DAG.getValueType(VA.getValVT()));
1778       else if (VA.getLocInfo() == CCValAssign::ZExt)
1779         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1780                                DAG.getValueType(VA.getValVT()));
1781       else if (VA.getLocInfo() == CCValAssign::BCvt)
1782         ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
1783 
1784       if (VA.isExtInLoc()) {
1785         // Handle MMX values passed in XMM regs.
1786         if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
1787           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
1788         else if (VA.getValVT().isVector() &&
1789                  VA.getValVT().getScalarType() == MVT::i1 &&
1790                  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
1791                   (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
1792           // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
1793           ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
1794         } else
1795           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1796       }
1797     } else {
1798       assert(VA.isMemLoc());
1799       ArgValue =
1800           LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
1801     }
1802 
1803     // If value is passed via pointer - do a load.
1804     if (VA.getLocInfo() == CCValAssign::Indirect &&
1805         !(Ins[I].Flags.isByVal() && VA.isRegLoc())) {
1806       ArgValue =
1807           DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
1808     }
1809 
1810     InVals.push_back(ArgValue);
1811   }
1812 
1813   for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
1814     if (Ins[I].Flags.isSwiftAsync()) {
1815       auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
1816       if (Subtarget.is64Bit())
1817         X86FI->setHasSwiftAsyncContext(true);
1818       else {
1819         int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
1820         X86FI->setSwiftAsyncContextFrameIdx(FI);
1821         SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
1822                                   DAG.getFrameIndex(FI, MVT::i32),
1823                                   MachinePointerInfo::getFixedStack(MF, FI));
1824         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
1825       }
1826     }
1827 
1828     // Swift calling convention does not require we copy the sret argument
1829     // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
1830     if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
1831       continue;
1832 
1833     // All x86 ABIs require that for returning structs by value we copy the
1834     // sret argument into %rax/%eax (depending on ABI) for the return. Save
1835     // the argument into a virtual register so that we can access it from the
1836     // return points.
1837     if (Ins[I].Flags.isSRet()) {
1838       assert(!FuncInfo->getSRetReturnReg() &&
1839              "SRet return has already been set");
1840       MVT PtrTy = getPointerTy(DAG.getDataLayout());
1841       Register Reg =
1842           MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
1843       FuncInfo->setSRetReturnReg(Reg);
1844       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
1845       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1846       break;
1847     }
1848   }
1849 
1850   unsigned StackSize = CCInfo.getStackSize();
1851   // Align stack specially for tail calls.
1852   if (shouldGuaranteeTCO(CallConv,
1853                          MF.getTarget().Options.GuaranteedTailCallOpt))
1854     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1855 
1856   if (IsVarArg)
1857     VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
1858         .lowerVarArgsParameters(Chain, StackSize);
1859 
1860   // Some CCs need callee pop.
1861   if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
1862                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
1863     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
1864   } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
1865     // X86 interrupts must pop the error code (and the alignment padding) if
1866     // present.
1867     FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
1868   } else {
1869     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
1870     // If this is an sret function, the return should pop the hidden pointer.
1871     if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
1872       FuncInfo->setBytesToPopOnReturn(4);
1873   }
1874 
1875   if (!Is64Bit) {
1876     // RegSaveFrameIndex is X86-64 only.
1877     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1878   }
1879 
1880   FuncInfo->setArgumentStackSize(StackSize);
1881 
1882   if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
1883     EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
1884     if (Personality == EHPersonality::CoreCLR) {
1885       assert(Is64Bit);
1886       // TODO: Add a mechanism to frame lowering that will allow us to indicate
1887       // that we'd prefer this slot be allocated towards the bottom of the frame
1888       // (i.e. near the stack pointer after allocating the frame).  Every
1889       // funclet needs a copy of this slot in its (mostly empty) frame, and the
1890       // offset from the bottom of this and each funclet's frame must be the
1891       // same, so the size of funclets' (mostly empty) frames is dictated by
1892       // how far this slot is from the bottom (since they allocate just enough
1893       // space to accommodate holding this slot at the correct offset).
1894       int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
1895       EHInfo->PSPSymFrameIdx = PSPSymFI;
1896     }
1897   }
1898 
1899   if (shouldDisableArgRegFromCSR(CallConv) ||
1900       F.hasFnAttribute("no_caller_saved_registers")) {
1901     MachineRegisterInfo &MRI = MF.getRegInfo();
1902     for (std::pair<Register, Register> Pair : MRI.liveins())
1903       MRI.disableCalleeSavedRegister(Pair.first);
1904   }
1905 
1906   return Chain;
1907 }
1908 
1909 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
1910                                             SDValue Arg, const SDLoc &dl,
1911                                             SelectionDAG &DAG,
1912                                             const CCValAssign &VA,
1913                                             ISD::ArgFlagsTy Flags,
1914                                             bool isByVal) const {
1915   unsigned LocMemOffset = VA.getLocMemOffset();
1916   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1917   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1918                        StackPtr, PtrOff);
1919   if (isByVal)
1920     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1921 
1922   MaybeAlign Alignment;
1923   if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
1924       Arg.getSimpleValueType() != MVT::f80)
1925     Alignment = MaybeAlign(4);
1926   return DAG.getStore(
1927       Chain, dl, Arg, PtrOff,
1928       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
1929       Alignment);
1930 }
1931 
1932 /// Emit a load of return address if tail call
1933 /// optimization is performed and it is required.
1934 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
1935     SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
1936     bool Is64Bit, int FPDiff, const SDLoc &dl) const {
1937   // Adjust the Return address stack slot.
1938   EVT VT = getPointerTy(DAG.getDataLayout());
1939   OutRetAddr = getReturnAddressFrameIndex(DAG);
1940 
1941   // Load the "old" Return address.
1942   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
1943   return SDValue(OutRetAddr.getNode(), 1);
1944 }
1945 
1946 /// Emit a store of the return address if tail call
1947 /// optimization is performed and it is required (FPDiff!=0).
1948 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
1949                                         SDValue Chain, SDValue RetAddrFrIdx,
1950                                         EVT PtrVT, unsigned SlotSize,
1951                                         int FPDiff, const SDLoc &dl) {
1952   // Store the return address to the appropriate stack slot.
1953   if (!FPDiff) return Chain;
1954   // Calculate the new stack slot for the return address.
1955   int NewReturnAddrFI =
1956     MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
1957                                          false);
1958   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
1959   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1960                        MachinePointerInfo::getFixedStack(
1961                            DAG.getMachineFunction(), NewReturnAddrFI));
1962   return Chain;
1963 }
1964 
1965 /// Returns a vector_shuffle mask for an movs{s|d}, movd
1966 /// operation of specified width.
1967 SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
1968                                    SDValue V1, SDValue V2) const {
1969   unsigned NumElems = VT.getVectorNumElements();
1970   SmallVector<int, 8> Mask;
1971   Mask.push_back(NumElems);
1972   for (unsigned i = 1; i != NumElems; ++i)
1973     Mask.push_back(i);
1974   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
1975 }
1976 
1977 SDValue
1978 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1979                              SmallVectorImpl<SDValue> &InVals) const {
1980   SelectionDAG &DAG                     = CLI.DAG;
1981   SDLoc &dl                             = CLI.DL;
1982   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1983   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
1984   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
1985   SDValue Chain                         = CLI.Chain;
1986   SDValue Callee                        = CLI.Callee;
1987   CallingConv::ID CallConv              = CLI.CallConv;
1988   bool &isTailCall                      = CLI.IsTailCall;
1989   bool isVarArg                         = CLI.IsVarArg;
1990   const auto *CB                        = CLI.CB;
1991 
1992   MachineFunction &MF = DAG.getMachineFunction();
1993   bool Is64Bit        = Subtarget.is64Bit();
1994   bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
1995   bool IsSibcall      = false;
1996   bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
1997       CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
1998   bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
1999   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2000   bool HasNCSR = (CB && isa<CallInst>(CB) &&
2001                   CB->hasFnAttr("no_caller_saved_registers"));
2002   bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
2003   bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
2004   bool IsCFICall = IsIndirectCall && CLI.CFIType;
2005   const Module *M = MF.getMMI().getModule();
2006   Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
2007 
2008   MachineFunction::CallSiteInfo CSInfo;
2009   if (CallConv == CallingConv::X86_INTR)
2010     report_fatal_error("X86 interrupts may not be called directly");
2011 
2012   bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
2013   if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
2014     // If we are using a GOT, disable tail calls to external symbols with
2015     // default visibility. Tail calling such a symbol requires using a GOT
2016     // relocation, which forces early binding of the symbol. This breaks code
2017     // that require lazy function symbol resolution. Using musttail or
2018     // GuaranteedTailCallOpt will override this.
2019     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2020     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
2021                G->getGlobal()->hasDefaultVisibility()))
2022       isTailCall = false;
2023   }
2024 
2025   if (isTailCall && !IsMustTail) {
2026     // Check if it's really possible to do a tail call.
2027     isTailCall = IsEligibleForTailCallOptimization(
2028         Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
2029         Ins, DAG);
2030 
2031     // Sibcalls are automatically detected tailcalls which do not require
2032     // ABI changes.
2033     if (!IsGuaranteeTCO && isTailCall)
2034       IsSibcall = true;
2035 
2036     if (isTailCall)
2037       ++NumTailCalls;
2038   }
2039 
2040   if (IsMustTail && !isTailCall)
2041     report_fatal_error("failed to perform tail call elimination on a call "
2042                        "site marked musttail");
2043 
2044   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
2045          "Var args not supported with calling convention fastcc, ghc or hipe");
2046 
2047   // Analyze operands of the call, assigning locations to each operand.
2048   SmallVector<CCValAssign, 16> ArgLocs;
2049   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2050 
2051   // Allocate shadow area for Win64.
2052   if (IsWin64)
2053     CCInfo.AllocateStack(32, Align(8));
2054 
2055   CCInfo.AnalyzeArguments(Outs, CC_X86);
2056 
2057   // In vectorcall calling convention a second pass is required for the HVA
2058   // types.
2059   if (CallingConv::X86_VectorCall == CallConv) {
2060     CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
2061   }
2062 
2063   // Get a count of how many bytes are to be pushed on the stack.
2064   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
2065   if (IsSibcall)
2066     // This is a sibcall. The memory operands are available in caller's
2067     // own caller's stack.
2068     NumBytes = 0;
2069   else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
2070     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2071 
2072   int FPDiff = 0;
2073   if (isTailCall &&
2074       shouldGuaranteeTCO(CallConv,
2075                          MF.getTarget().Options.GuaranteedTailCallOpt)) {
2076     // Lower arguments at fp - stackoffset + fpdiff.
2077     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2078 
2079     FPDiff = NumBytesCallerPushed - NumBytes;
2080 
2081     // Set the delta of movement of the returnaddr stackslot.
2082     // But only set if delta is greater than previous delta.
2083     if (FPDiff < X86Info->getTCReturnAddrDelta())
2084       X86Info->setTCReturnAddrDelta(FPDiff);
2085   }
2086 
2087   unsigned NumBytesToPush = NumBytes;
2088   unsigned NumBytesToPop = NumBytes;
2089 
2090   // If we have an inalloca argument, all stack space has already been allocated
2091   // for us and be right at the top of the stack.  We don't support multiple
2092   // arguments passed in memory when using inalloca.
2093   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2094     NumBytesToPush = 0;
2095     if (!ArgLocs.back().isMemLoc())
2096       report_fatal_error("cannot use inalloca attribute on a register "
2097                          "parameter");
2098     if (ArgLocs.back().getLocMemOffset() != 0)
2099       report_fatal_error("any parameter with the inalloca attribute must be "
2100                          "the only memory argument");
2101   } else if (CLI.IsPreallocated) {
2102     assert(ArgLocs.back().isMemLoc() &&
2103            "cannot use preallocated attribute on a register "
2104            "parameter");
2105     SmallVector<size_t, 4> PreallocatedOffsets;
2106     for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
2107       if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
2108         PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
2109       }
2110     }
2111     auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
2112     size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
2113     MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
2114     MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
2115     NumBytesToPush = 0;
2116   }
2117 
2118   if (!IsSibcall && !IsMustTail)
2119     Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
2120                                  NumBytes - NumBytesToPush, dl);
2121 
2122   SDValue RetAddrFrIdx;
2123   // Load return address for tail calls.
2124   if (isTailCall && FPDiff)
2125     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2126                                     Is64Bit, FPDiff, dl);
2127 
2128   SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
2129   SmallVector<SDValue, 8> MemOpChains;
2130   SDValue StackPtr;
2131 
2132   // The next loop assumes that the locations are in the same order of the
2133   // input arguments.
2134   assert(isSortedByValueNo(ArgLocs) &&
2135          "Argument Location list must be sorted before lowering");
2136 
2137   // Walk the register/memloc assignments, inserting copies/loads.  In the case
2138   // of tail call optimization arguments are handled later.
2139   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2140   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
2141        ++I, ++OutIndex) {
2142     assert(OutIndex < Outs.size() && "Invalid Out index");
2143     // Skip inalloca/preallocated arguments, they have already been written.
2144     ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
2145     if (Flags.isInAlloca() || Flags.isPreallocated())
2146       continue;
2147 
2148     CCValAssign &VA = ArgLocs[I];
2149     EVT RegVT = VA.getLocVT();
2150     SDValue Arg = OutVals[OutIndex];
2151     bool isByVal = Flags.isByVal();
2152 
2153     // Promote the value if needed.
2154     switch (VA.getLocInfo()) {
2155     default: llvm_unreachable("Unknown loc info!");
2156     case CCValAssign::Full: break;
2157     case CCValAssign::SExt:
2158       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2159       break;
2160     case CCValAssign::ZExt:
2161       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2162       break;
2163     case CCValAssign::AExt:
2164       if (Arg.getValueType().isVector() &&
2165           Arg.getValueType().getVectorElementType() == MVT::i1)
2166         Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
2167       else if (RegVT.is128BitVector()) {
2168         // Special case: passing MMX values in XMM registers.
2169         Arg = DAG.getBitcast(MVT::i64, Arg);
2170         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2171         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2172       } else
2173         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2174       break;
2175     case CCValAssign::BCvt:
2176       Arg = DAG.getBitcast(RegVT, Arg);
2177       break;
2178     case CCValAssign::Indirect: {
2179       if (isByVal) {
2180         // Memcpy the argument to a temporary stack slot to prevent
2181         // the caller from seeing any modifications the callee may make
2182         // as guaranteed by the `byval` attribute.
2183         int FrameIdx = MF.getFrameInfo().CreateStackObject(
2184             Flags.getByValSize(),
2185             std::max(Align(16), Flags.getNonZeroByValAlign()), false);
2186         SDValue StackSlot =
2187             DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
2188         Chain =
2189             CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
2190         // From now on treat this as a regular pointer
2191         Arg = StackSlot;
2192         isByVal = false;
2193       } else {
2194         // Store the argument.
2195         SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2196         int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2197         Chain = DAG.getStore(
2198             Chain, dl, Arg, SpillSlot,
2199             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2200         Arg = SpillSlot;
2201       }
2202       break;
2203     }
2204     }
2205 
2206     if (VA.needsCustom()) {
2207       assert(VA.getValVT() == MVT::v64i1 &&
2208              "Currently the only custom case is when we split v64i1 to 2 regs");
2209       // Split v64i1 value into two registers
2210       Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
2211     } else if (VA.isRegLoc()) {
2212       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2213       const TargetOptions &Options = DAG.getTarget().Options;
2214       if (Options.EmitCallSiteInfo)
2215         CSInfo.emplace_back(VA.getLocReg(), I);
2216       if (isVarArg && IsWin64) {
2217         // Win64 ABI requires argument XMM reg to be copied to the corresponding
2218         // shadow reg if callee is a varargs function.
2219         Register ShadowReg;
2220         switch (VA.getLocReg()) {
2221         case X86::XMM0: ShadowReg = X86::RCX; break;
2222         case X86::XMM1: ShadowReg = X86::RDX; break;
2223         case X86::XMM2: ShadowReg = X86::R8; break;
2224         case X86::XMM3: ShadowReg = X86::R9; break;
2225         }
2226         if (ShadowReg)
2227           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2228       }
2229     } else if (!IsSibcall && (!isTailCall || isByVal)) {
2230       assert(VA.isMemLoc());
2231       if (!StackPtr.getNode())
2232         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2233                                       getPointerTy(DAG.getDataLayout()));
2234       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2235                                              dl, DAG, VA, Flags, isByVal));
2236     }
2237   }
2238 
2239   if (!MemOpChains.empty())
2240     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2241 
2242   if (Subtarget.isPICStyleGOT()) {
2243     // ELF / PIC requires GOT in the EBX register before function calls via PLT
2244     // GOT pointer (except regcall).
2245     if (!isTailCall) {
2246       // Indirect call with RegCall calling convention may use up all the
2247       // general registers, so it is not suitable to bind EBX register for
2248       // GOT address, just let register allocator handle it.
2249       if (CallConv != CallingConv::X86_RegCall)
2250         RegsToPass.push_back(std::make_pair(
2251           Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2252                                           getPointerTy(DAG.getDataLayout()))));
2253     } else {
2254       // If we are tail calling and generating PIC/GOT style code load the
2255       // address of the callee into ECX. The value in ecx is used as target of
2256       // the tail jump. This is done to circumvent the ebx/callee-saved problem
2257       // for tail calls on PIC/GOT architectures. Normally we would just put the
2258       // address of GOT into ebx and then call target@PLT. But for tail calls
2259       // ebx would be restored (since ebx is callee saved) before jumping to the
2260       // target@PLT.
2261 
2262       // Note: The actual moving to ECX is done further down.
2263       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2264       if (G && !G->getGlobal()->hasLocalLinkage() &&
2265           G->getGlobal()->hasDefaultVisibility())
2266         Callee = LowerGlobalAddress(Callee, DAG);
2267       else if (isa<ExternalSymbolSDNode>(Callee))
2268         Callee = LowerExternalSymbol(Callee, DAG);
2269     }
2270   }
2271 
2272   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
2273       (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
2274     // From AMD64 ABI document:
2275     // For calls that may call functions that use varargs or stdargs
2276     // (prototype-less calls or calls to functions containing ellipsis (...) in
2277     // the declaration) %al is used as hidden argument to specify the number
2278     // of SSE registers used. The contents of %al do not need to match exactly
2279     // the number of registers, but must be an upper bound on the number of SSE
2280     // registers used and is in the range 0 - 8 inclusive.
2281 
2282     // Count the number of XMM registers allocated.
2283     static const MCPhysReg XMMArgRegs[] = {
2284       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2285       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2286     };
2287     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
2288     assert((Subtarget.hasSSE1() || !NumXMMRegs)
2289            && "SSE registers cannot be used when SSE is disabled");
2290     RegsToPass.push_back(std::make_pair(Register(X86::AL),
2291                                         DAG.getConstant(NumXMMRegs, dl,
2292                                                         MVT::i8)));
2293   }
2294 
2295   if (isVarArg && IsMustTail) {
2296     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
2297     for (const auto &F : Forwards) {
2298       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2299       RegsToPass.push_back(std::make_pair(F.PReg, Val));
2300     }
2301   }
2302 
2303   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
2304   // don't need this because the eligibility check rejects calls that require
2305   // shuffling arguments passed in memory.
2306   if (!IsSibcall && isTailCall) {
2307     // Force all the incoming stack arguments to be loaded from the stack
2308     // before any new outgoing arguments are stored to the stack, because the
2309     // outgoing stack slots may alias the incoming argument stack slots, and
2310     // the alias isn't otherwise explicit. This is slightly more conservative
2311     // than necessary, because it means that each store effectively depends
2312     // on every argument instead of just those arguments it would clobber.
2313     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2314 
2315     SmallVector<SDValue, 8> MemOpChains2;
2316     SDValue FIN;
2317     int FI = 0;
2318     for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
2319          ++I, ++OutsIndex) {
2320       CCValAssign &VA = ArgLocs[I];
2321 
2322       if (VA.isRegLoc()) {
2323         if (VA.needsCustom()) {
2324           assert((CallConv == CallingConv::X86_RegCall) &&
2325                  "Expecting custom case only in regcall calling convention");
2326           // This means that we are in special case where one argument was
2327           // passed through two register locations - Skip the next location
2328           ++I;
2329         }
2330 
2331         continue;
2332       }
2333 
2334       assert(VA.isMemLoc());
2335       SDValue Arg = OutVals[OutsIndex];
2336       ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
2337       // Skip inalloca/preallocated arguments.  They don't require any work.
2338       if (Flags.isInAlloca() || Flags.isPreallocated())
2339         continue;
2340       // Create frame index.
2341       int32_t Offset = VA.getLocMemOffset()+FPDiff;
2342       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2343       FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
2344       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2345 
2346       if (Flags.isByVal()) {
2347         // Copy relative to framepointer.
2348         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
2349         if (!StackPtr.getNode())
2350           StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2351                                         getPointerTy(DAG.getDataLayout()));
2352         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2353                              StackPtr, Source);
2354 
2355         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2356                                                          ArgChain,
2357                                                          Flags, DAG, dl));
2358       } else {
2359         // Store relative to framepointer.
2360         MemOpChains2.push_back(DAG.getStore(
2361             ArgChain, dl, Arg, FIN,
2362             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
2363       }
2364     }
2365 
2366     if (!MemOpChains2.empty())
2367       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
2368 
2369     // Store the return address to the appropriate stack slot.
2370     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
2371                                      getPointerTy(DAG.getDataLayout()),
2372                                      RegInfo->getSlotSize(), FPDiff, dl);
2373   }
2374 
2375   // Build a sequence of copy-to-reg nodes chained together with token chain
2376   // and glue operands which copy the outgoing args into registers.
2377   SDValue InGlue;
2378   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2379     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2380                              RegsToPass[i].second, InGlue);
2381     InGlue = Chain.getValue(1);
2382   }
2383 
2384   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
2385     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2386     // In the 64-bit large code model, we have to make all calls
2387     // through a register, since the call instruction's 32-bit
2388     // pc-relative offset may not be large enough to hold the whole
2389     // address.
2390   } else if (Callee->getOpcode() == ISD::GlobalAddress ||
2391              Callee->getOpcode() == ISD::ExternalSymbol) {
2392     // Lower direct calls to global addresses and external symbols. Setting
2393     // ForCall to true here has the effect of removing WrapperRIP when possible
2394     // to allow direct calls to be selected without first materializing the
2395     // address into a register.
2396     Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
2397   } else if (Subtarget.isTarget64BitILP32() &&
2398              Callee.getValueType() == MVT::i32) {
2399     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
2400     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
2401   }
2402 
2403   // Returns a chain & a glue for retval copy to use.
2404   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2405   SmallVector<SDValue, 8> Ops;
2406 
2407   if (!IsSibcall && isTailCall && !IsMustTail) {
2408     Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
2409     InGlue = Chain.getValue(1);
2410   }
2411 
2412   Ops.push_back(Chain);
2413   Ops.push_back(Callee);
2414 
2415   if (isTailCall)
2416     Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
2417 
2418   // Add argument registers to the end of the list so that they are known live
2419   // into the call.
2420   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2421     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2422                                   RegsToPass[i].second.getValueType()));
2423 
2424   // Add a register mask operand representing the call-preserved registers.
2425   const uint32_t *Mask = [&]() {
2426     auto AdaptedCC = CallConv;
2427     // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
2428     // use X86_INTR calling convention because it has the same CSR mask
2429     // (same preserved registers).
2430     if (HasNCSR)
2431       AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
2432     // If NoCalleeSavedRegisters is requested, then use GHC since it happens
2433     // to use the CSR_NoRegs_RegMask.
2434     if (CB && CB->hasFnAttr("no_callee_saved_registers"))
2435       AdaptedCC = (CallingConv::ID)CallingConv::GHC;
2436     return RegInfo->getCallPreservedMask(MF, AdaptedCC);
2437   }();
2438   assert(Mask && "Missing call preserved mask for calling convention");
2439 
2440   // If this is an invoke in a 32-bit function using a funclet-based
2441   // personality, assume the function clobbers all registers. If an exception
2442   // is thrown, the runtime will not restore CSRs.
2443   // FIXME: Model this more precisely so that we can register allocate across
2444   // the normal edge and spill and fill across the exceptional edge.
2445   if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
2446     const Function &CallerFn = MF.getFunction();
2447     EHPersonality Pers =
2448         CallerFn.hasPersonalityFn()
2449             ? classifyEHPersonality(CallerFn.getPersonalityFn())
2450             : EHPersonality::Unknown;
2451     if (isFuncletEHPersonality(Pers))
2452       Mask = RegInfo->getNoPreservedMask();
2453   }
2454 
2455   // Define a new register mask from the existing mask.
2456   uint32_t *RegMask = nullptr;
2457 
2458   // In some calling conventions we need to remove the used physical registers
2459   // from the reg mask. Create a new RegMask for such calling conventions.
2460   // RegMask for calling conventions that disable only return registers (e.g.
2461   // preserve_most) will be modified later in LowerCallResult.
2462   bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
2463   if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
2464     const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2465 
2466     // Allocate a new Reg Mask and copy Mask.
2467     RegMask = MF.allocateRegMask();
2468     unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
2469     memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
2470 
2471     // Make sure all sub registers of the argument registers are reset
2472     // in the RegMask.
2473     if (ShouldDisableArgRegs) {
2474       for (auto const &RegPair : RegsToPass)
2475         for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first))
2476           RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
2477     }
2478 
2479     // Create the RegMask Operand according to our updated mask.
2480     Ops.push_back(DAG.getRegisterMask(RegMask));
2481   } else {
2482     // Create the RegMask Operand according to the static mask.
2483     Ops.push_back(DAG.getRegisterMask(Mask));
2484   }
2485 
2486   if (InGlue.getNode())
2487     Ops.push_back(InGlue);
2488 
2489   if (isTailCall) {
2490     // We used to do:
2491     //// If this is the first return lowered for this function, add the regs
2492     //// to the liveout set for the function.
2493     // This isn't right, although it's probably harmless on x86; liveouts
2494     // should be computed from returns not tail calls.  Consider a void
2495     // function making a tail call to a function returning int.
2496     MF.getFrameInfo().setHasTailCall();
2497     SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
2498 
2499     if (IsCFICall)
2500       Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2501 
2502     DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2503     DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2504     return Ret;
2505   }
2506 
2507   if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
2508     Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
2509   } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
2510     // Calls with a "clang.arc.attachedcall" bundle are special. They should be
2511     // expanded to the call, directly followed by a special marker sequence and
2512     // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
2513     assert(!isTailCall &&
2514            "tail calls cannot be marked with clang.arc.attachedcall");
2515     assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
2516 
2517     // Add a target global address for the retainRV/claimRV runtime function
2518     // just before the call target.
2519     Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
2520     auto PtrVT = getPointerTy(DAG.getDataLayout());
2521     auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
2522     Ops.insert(Ops.begin() + 1, GA);
2523     Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
2524   } else {
2525     Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
2526   }
2527 
2528   if (IsCFICall)
2529     Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2530 
2531   InGlue = Chain.getValue(1);
2532   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2533   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2534 
2535   // Save heapallocsite metadata.
2536   if (CLI.CB)
2537     if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
2538       DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
2539 
2540   // Create the CALLSEQ_END node.
2541   unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
2542   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2543                        DAG.getTarget().Options.GuaranteedTailCallOpt))
2544     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
2545   else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
2546     // If this call passes a struct-return pointer, the callee
2547     // pops that struct pointer.
2548     NumBytesForCalleeToPop = 4;
2549 
2550   // Returns a glue for retval copy to use.
2551   if (!IsSibcall) {
2552     Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
2553                                InGlue, dl);
2554     InGlue = Chain.getValue(1);
2555   }
2556 
2557   // Handle result values, copying them out of physregs into vregs that we
2558   // return.
2559   return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2560                          InVals, RegMask);
2561 }
2562 
2563 //===----------------------------------------------------------------------===//
2564 //                Fast Calling Convention (tail call) implementation
2565 //===----------------------------------------------------------------------===//
2566 
2567 //  Like the stdcall convention (callee cleans arguments), except that ECX is
2568 //  reserved for storing the tail called function address. Only 2 registers are
2569 //  free for argument passing (inreg). Tail call optimization is performed
2570 //  provided:
2571 //                * tailcallopt is enabled
2572 //                * caller/callee are fastcc
2573 //  On X86_64 architecture with GOT-style position independent code only local
2574 //  (within module) calls are supported at the moment.
2575 //  To keep the stack aligned according to platform abi the function
2576 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
2577 //  of stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
2578 //  If a tail called function callee has more arguments than the caller the
2579 //  caller needs to make sure that there is room to move the RETADDR to. This is
2580 //  achieved by reserving an area the size of the argument delta right after the
2581 //  original RETADDR, but before the saved framepointer or the spilled registers
2582 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
2583 //  stack layout:
2584 //    arg1
2585 //    arg2
2586 //    RETADDR
2587 //    [ new RETADDR
2588 //      move area ]
2589 //    (possible EBP)
2590 //    ESI
2591 //    EDI
2592 //    local1 ..
2593 
2594 /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
2595 /// requirement.
2596 unsigned
2597 X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
2598                                                SelectionDAG &DAG) const {
2599   const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
2600   const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
2601   assert(StackSize % SlotSize == 0 &&
2602          "StackSize must be a multiple of SlotSize");
2603   return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
2604 }
2605 
2606 /// Return true if the given stack call argument is already available in the
2607 /// same position (relatively) of the caller's incoming argument stack.
2608 static
2609 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2610                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2611                          const X86InstrInfo *TII, const CCValAssign &VA) {
2612   unsigned Bytes = Arg.getValueSizeInBits() / 8;
2613 
2614   for (;;) {
2615     // Look through nodes that don't alter the bits of the incoming value.
2616     unsigned Op = Arg.getOpcode();
2617     if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
2618         Op == ISD::AssertZext) {
2619       Arg = Arg.getOperand(0);
2620       continue;
2621     }
2622     if (Op == ISD::TRUNCATE) {
2623       const SDValue &TruncInput = Arg.getOperand(0);
2624       if (TruncInput.getOpcode() == ISD::AssertZext &&
2625           cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
2626               Arg.getValueType()) {
2627         Arg = TruncInput.getOperand(0);
2628         continue;
2629       }
2630     }
2631     break;
2632   }
2633 
2634   int FI = INT_MAX;
2635   if (Arg.getOpcode() == ISD::CopyFromReg) {
2636     Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2637     if (!VR.isVirtual())
2638       return false;
2639     MachineInstr *Def = MRI->getVRegDef(VR);
2640     if (!Def)
2641       return false;
2642     if (!Flags.isByVal()) {
2643       if (!TII->isLoadFromStackSlot(*Def, FI))
2644         return false;
2645     } else {
2646       unsigned Opcode = Def->getOpcode();
2647       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
2648            Opcode == X86::LEA64_32r) &&
2649           Def->getOperand(1).isFI()) {
2650         FI = Def->getOperand(1).getIndex();
2651         Bytes = Flags.getByValSize();
2652       } else
2653         return false;
2654     }
2655   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2656     if (Flags.isByVal())
2657       // ByVal argument is passed in as a pointer but it's now being
2658       // dereferenced. e.g.
2659       // define @foo(%struct.X* %A) {
2660       //   tail call @bar(%struct.X* byval %A)
2661       // }
2662       return false;
2663     SDValue Ptr = Ld->getBasePtr();
2664     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2665     if (!FINode)
2666       return false;
2667     FI = FINode->getIndex();
2668   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
2669     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
2670     FI = FINode->getIndex();
2671     Bytes = Flags.getByValSize();
2672   } else
2673     return false;
2674 
2675   assert(FI != INT_MAX);
2676   if (!MFI.isFixedObjectIndex(FI))
2677     return false;
2678 
2679   if (Offset != MFI.getObjectOffset(FI))
2680     return false;
2681 
2682   // If this is not byval, check that the argument stack object is immutable.
2683   // inalloca and argument copy elision can create mutable argument stack
2684   // objects. Byval objects can be mutated, but a byval call intends to pass the
2685   // mutated memory.
2686   if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
2687     return false;
2688 
2689   if (VA.getLocVT().getFixedSizeInBits() >
2690       Arg.getValueSizeInBits().getFixedValue()) {
2691     // If the argument location is wider than the argument type, check that any
2692     // extension flags match.
2693     if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
2694         Flags.isSExt() != MFI.isObjectSExt(FI)) {
2695       return false;
2696     }
2697   }
2698 
2699   return Bytes == MFI.getObjectSize(FI);
2700 }
2701 
2702 /// Check whether the call is eligible for tail call optimization. Targets
2703 /// that want to do tail call optimization should implement this function.
2704 bool X86TargetLowering::IsEligibleForTailCallOptimization(
2705     SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
2706     bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
2707     const SmallVectorImpl<SDValue> &OutVals,
2708     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2709   if (!mayTailCallThisCC(CalleeCC))
2710     return false;
2711 
2712   // If -tailcallopt is specified, make fastcc functions tail-callable.
2713   MachineFunction &MF = DAG.getMachineFunction();
2714   const Function &CallerF = MF.getFunction();
2715 
2716   // If the function return type is x86_fp80 and the callee return type is not,
2717   // then the FP_EXTEND of the call result is not a nop. It's not safe to
2718   // perform a tailcall optimization here.
2719   if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
2720     return false;
2721 
2722   CallingConv::ID CallerCC = CallerF.getCallingConv();
2723   bool CCMatch = CallerCC == CalleeCC;
2724   bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
2725   bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
2726   bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
2727       CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
2728 
2729   // Win64 functions have extra shadow space for argument homing. Don't do the
2730   // sibcall if the caller and callee have mismatched expectations for this
2731   // space.
2732   if (IsCalleeWin64 != IsCallerWin64)
2733     return false;
2734 
2735   if (IsGuaranteeTCO) {
2736     if (canGuaranteeTCO(CalleeCC) && CCMatch)
2737       return true;
2738     return false;
2739   }
2740 
2741   // Look for obvious safe cases to perform tail call optimization that do not
2742   // require ABI changes. This is what gcc calls sibcall.
2743 
2744   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
2745   // emit a special epilogue.
2746   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2747   if (RegInfo->hasStackRealignment(MF))
2748     return false;
2749 
2750   // Also avoid sibcall optimization if we're an sret return fn and the callee
2751   // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
2752   // insufficient.
2753   if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
2754     // For a compatible tail call the callee must return our sret pointer. So it
2755     // needs to be (a) an sret function itself and (b) we pass our sret as its
2756     // sret. Condition #b is harder to determine.
2757     return false;
2758   } else if (IsCalleePopSRet)
2759     // The callee pops an sret, so we cannot tail-call, as our caller doesn't
2760     // expect that.
2761     return false;
2762 
2763   // Do not sibcall optimize vararg calls unless all arguments are passed via
2764   // registers.
2765   LLVMContext &C = *DAG.getContext();
2766   if (isVarArg && !Outs.empty()) {
2767     // Optimizing for varargs on Win64 is unlikely to be safe without
2768     // additional testing.
2769     if (IsCalleeWin64 || IsCallerWin64)
2770       return false;
2771 
2772     SmallVector<CCValAssign, 16> ArgLocs;
2773     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
2774     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2775     for (const auto &VA : ArgLocs)
2776       if (!VA.isRegLoc())
2777         return false;
2778   }
2779 
2780   // If the call result is in ST0 / ST1, it needs to be popped off the x87
2781   // stack.  Therefore, if it's not used by the call it is not safe to optimize
2782   // this into a sibcall.
2783   bool Unused = false;
2784   for (const auto &In : Ins) {
2785     if (!In.Used) {
2786       Unused = true;
2787       break;
2788     }
2789   }
2790   if (Unused) {
2791     SmallVector<CCValAssign, 16> RVLocs;
2792     CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
2793     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2794     for (const auto &VA : RVLocs) {
2795       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
2796         return false;
2797     }
2798   }
2799 
2800   // Check that the call results are passed in the same way.
2801   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
2802                                   RetCC_X86, RetCC_X86))
2803     return false;
2804   // The callee has to preserve all registers the caller needs to preserve.
2805   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2806   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2807   if (!CCMatch) {
2808     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2809     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2810       return false;
2811   }
2812 
2813   unsigned StackArgsSize = 0;
2814 
2815   // If the callee takes no arguments then go on to check the results of the
2816   // call.
2817   if (!Outs.empty()) {
2818     // Check if stack adjustment is needed. For now, do not do this if any
2819     // argument is passed on the stack.
2820     SmallVector<CCValAssign, 16> ArgLocs;
2821     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
2822 
2823     // Allocate shadow area for Win64
2824     if (IsCalleeWin64)
2825       CCInfo.AllocateStack(32, Align(8));
2826 
2827     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2828     StackArgsSize = CCInfo.getStackSize();
2829 
2830     if (CCInfo.getStackSize()) {
2831       // Check if the arguments are already laid out in the right way as
2832       // the caller's fixed stack objects.
2833       MachineFrameInfo &MFI = MF.getFrameInfo();
2834       const MachineRegisterInfo *MRI = &MF.getRegInfo();
2835       const X86InstrInfo *TII = Subtarget.getInstrInfo();
2836       for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
2837         const CCValAssign &VA = ArgLocs[I];
2838         SDValue Arg = OutVals[I];
2839         ISD::ArgFlagsTy Flags = Outs[I].Flags;
2840         if (VA.getLocInfo() == CCValAssign::Indirect)
2841           return false;
2842         if (!VA.isRegLoc()) {
2843           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI,
2844                                    TII, VA))
2845             return false;
2846         }
2847       }
2848     }
2849 
2850     bool PositionIndependent = isPositionIndependent();
2851     // If the tailcall address may be in a register, then make sure it's
2852     // possible to register allocate for it. In 32-bit, the call address can
2853     // only target EAX, EDX, or ECX since the tail call must be scheduled after
2854     // callee-saved registers are restored. These happen to be the same
2855     // registers used to pass 'inreg' arguments so watch out for those.
2856     if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
2857                                   !isa<ExternalSymbolSDNode>(Callee)) ||
2858                                  PositionIndependent)) {
2859       unsigned NumInRegs = 0;
2860       // In PIC we need an extra register to formulate the address computation
2861       // for the callee.
2862       unsigned MaxInRegs = PositionIndependent ? 2 : 3;
2863 
2864       for (const auto &VA : ArgLocs) {
2865         if (!VA.isRegLoc())
2866           continue;
2867         Register Reg = VA.getLocReg();
2868         switch (Reg) {
2869         default: break;
2870         case X86::EAX: case X86::EDX: case X86::ECX:
2871           if (++NumInRegs == MaxInRegs)
2872             return false;
2873           break;
2874         }
2875       }
2876     }
2877 
2878     const MachineRegisterInfo &MRI = MF.getRegInfo();
2879     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
2880       return false;
2881   }
2882 
2883   bool CalleeWillPop =
2884       X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
2885                        MF.getTarget().Options.GuaranteedTailCallOpt);
2886 
2887   if (unsigned BytesToPop =
2888           MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
2889     // If we have bytes to pop, the callee must pop them.
2890     bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
2891     if (!CalleePopMatches)
2892       return false;
2893   } else if (CalleeWillPop && StackArgsSize > 0) {
2894     // If we don't have bytes to pop, make sure the callee doesn't pop any.
2895     return false;
2896   }
2897 
2898   return true;
2899 }
2900 
2901 /// Determines whether the callee is required to pop its own arguments.
2902 /// Callee pop is necessary to support tail calls.
2903 bool X86::isCalleePop(CallingConv::ID CallingConv,
2904                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
2905   // If GuaranteeTCO is true, we force some calls to be callee pop so that we
2906   // can guarantee TCO.
2907   if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
2908     return true;
2909 
2910   switch (CallingConv) {
2911   default:
2912     return false;
2913   case CallingConv::X86_StdCall:
2914   case CallingConv::X86_FastCall:
2915   case CallingConv::X86_ThisCall:
2916   case CallingConv::X86_VectorCall:
2917     return !is64Bit;
2918   }
2919 }
2920