1 //===- llvm/lib/Target/X86/X86ISelCallLowering.cpp - Call lowering --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file implements the lowering of LLVM calls to DAG nodes.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "MCTargetDesc/X86MCAsmInfo.h"
15 #include "X86.h"
16 #include "X86CallingConv.h"
17 #include "X86FrameLowering.h"
18 #include "X86ISelLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86MachineFunctionInfo.h"
21 #include "X86TargetMachine.h"
22 #include "llvm/ADT/Statistic.h"
23 #include "llvm/Analysis/ObjCARCUtil.h"
24 #include "llvm/CodeGen/MachineJumpTableInfo.h"
25 #include "llvm/CodeGen/MachineModuleInfo.h"
26 #include "llvm/CodeGen/WinEHFuncInfo.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IRBuilder.h"
29 #include "llvm/IR/Module.h"
30
31 #define DEBUG_TYPE "x86-isel"
32
33 using namespace llvm;
34
35 STATISTIC(NumTailCalls, "Number of tail calls");
36
37 /// Call this when the user attempts to do something unsupported, like
38 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
39 /// report_fatal_error, so calling code should attempt to recover without
40 /// crashing.
errorUnsupported(SelectionDAG & DAG,const SDLoc & dl,const char * Msg)41 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
42 const char *Msg) {
43 MachineFunction &MF = DAG.getMachineFunction();
44 DAG.getContext()->diagnose(
45 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
46 }
47
48 /// Returns true if a CC can dynamically exclude a register from the list of
49 /// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
50 /// the return registers.
shouldDisableRetRegFromCSR(CallingConv::ID CC)51 static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
52 switch (CC) {
53 default:
54 return false;
55 case CallingConv::X86_RegCall:
56 case CallingConv::PreserveMost:
57 case CallingConv::PreserveAll:
58 return true;
59 }
60 }
61
62 /// Returns true if a CC can dynamically exclude a register from the list of
63 /// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
64 /// the parameters.
shouldDisableArgRegFromCSR(CallingConv::ID CC)65 static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
66 return CC == CallingConv::X86_RegCall;
67 }
68
69 static std::pair<MVT, unsigned>
handleMaskRegisterForCallingConv(unsigned NumElts,CallingConv::ID CC,const X86Subtarget & Subtarget)70 handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
71 const X86Subtarget &Subtarget) {
72 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
73 // convention is one that uses k registers.
74 if (NumElts == 2)
75 return {MVT::v2i64, 1};
76 if (NumElts == 4)
77 return {MVT::v4i32, 1};
78 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
79 CC != CallingConv::Intel_OCL_BI)
80 return {MVT::v8i16, 1};
81 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
82 CC != CallingConv::Intel_OCL_BI)
83 return {MVT::v16i8, 1};
84 // v32i1 passes in ymm unless we have BWI and the calling convention is
85 // regcall.
86 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
87 return {MVT::v32i8, 1};
88 // Split v64i1 vectors if we don't have v64i8 available.
89 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
90 if (Subtarget.useAVX512Regs())
91 return {MVT::v64i8, 1};
92 return {MVT::v32i8, 2};
93 }
94
95 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
96 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
97 NumElts > 64)
98 return {MVT::i8, NumElts};
99
100 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
101 }
102
getRegisterTypeForCallingConv(LLVMContext & Context,CallingConv::ID CC,EVT VT) const103 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
104 CallingConv::ID CC,
105 EVT VT) const {
106 if (VT.isVector()) {
107 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
108 unsigned NumElts = VT.getVectorNumElements();
109
110 MVT RegisterVT;
111 unsigned NumRegisters;
112 std::tie(RegisterVT, NumRegisters) =
113 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
114 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
115 return RegisterVT;
116 }
117
118 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
119 return MVT::v8f16;
120 }
121
122 // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
123 if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
124 !Subtarget.hasX87())
125 return MVT::i32;
126
127 if (isTypeLegal(MVT::f16)) {
128 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
129 return getRegisterTypeForCallingConv(
130 Context, CC, VT.changeVectorElementType(MVT::f16));
131
132 if (VT == MVT::bf16)
133 return MVT::f16;
134 }
135
136 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
137 }
138
getNumRegistersForCallingConv(LLVMContext & Context,CallingConv::ID CC,EVT VT) const139 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
140 CallingConv::ID CC,
141 EVT VT) const {
142 if (VT.isVector()) {
143 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
144 unsigned NumElts = VT.getVectorNumElements();
145
146 MVT RegisterVT;
147 unsigned NumRegisters;
148 std::tie(RegisterVT, NumRegisters) =
149 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
150 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
151 return NumRegisters;
152 }
153
154 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
155 return 1;
156 }
157
158 // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
159 // x87 is disabled.
160 if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
161 if (VT == MVT::f64)
162 return 2;
163 if (VT == MVT::f80)
164 return 3;
165 }
166
167 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
168 isTypeLegal(MVT::f16))
169 return getNumRegistersForCallingConv(Context, CC,
170 VT.changeVectorElementType(MVT::f16));
171
172 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
173 }
174
getVectorTypeBreakdownForCallingConv(LLVMContext & Context,CallingConv::ID CC,EVT VT,EVT & IntermediateVT,unsigned & NumIntermediates,MVT & RegisterVT) const175 unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
176 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
177 unsigned &NumIntermediates, MVT &RegisterVT) const {
178 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
179 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
180 Subtarget.hasAVX512() &&
181 (!isPowerOf2_32(VT.getVectorNumElements()) ||
182 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
183 VT.getVectorNumElements() > 64)) {
184 RegisterVT = MVT::i8;
185 IntermediateVT = MVT::i1;
186 NumIntermediates = VT.getVectorNumElements();
187 return NumIntermediates;
188 }
189
190 // Split v64i1 vectors if we don't have v64i8 available.
191 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
192 CC != CallingConv::X86_RegCall) {
193 RegisterVT = MVT::v32i8;
194 IntermediateVT = MVT::v32i1;
195 NumIntermediates = 2;
196 return 2;
197 }
198
199 // Split vNbf16 vectors according to vNf16.
200 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
201 isTypeLegal(MVT::f16))
202 VT = VT.changeVectorElementType(MVT::f16);
203
204 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
205 NumIntermediates, RegisterVT);
206 }
207
getSetCCResultType(const DataLayout & DL,LLVMContext & Context,EVT VT) const208 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
209 LLVMContext& Context,
210 EVT VT) const {
211 if (!VT.isVector())
212 return MVT::i8;
213
214 if (Subtarget.hasAVX512()) {
215 // Figure out what this type will be legalized to.
216 EVT LegalVT = VT;
217 while (getTypeAction(Context, LegalVT) != TypeLegal)
218 LegalVT = getTypeToTransformTo(Context, LegalVT);
219
220 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
221 if (LegalVT.getSimpleVT().is512BitVector())
222 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
223
224 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
225 // If we legalized to less than a 512-bit vector, then we will use a vXi1
226 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
227 // vXi16/vXi8.
228 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
229 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
230 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
231 }
232 }
233
234 return VT.changeVectorElementTypeToInteger();
235 }
236
functionArgumentNeedsConsecutiveRegisters(Type * Ty,CallingConv::ID CallConv,bool isVarArg,const DataLayout & DL) const237 bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters(
238 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
239 const DataLayout &DL) const {
240 // On x86-64 i128 is split into two i64s and needs to be allocated to two
241 // consecutive registers, or spilled to the stack as a whole. On x86-32 i128
242 // is split to four i32s and never actually passed in registers, but we use
243 // the consecutive register mark to match it in TableGen.
244 if (Ty->isIntegerTy(128))
245 return true;
246
247 // On x86-32, fp128 acts the same as i128.
248 if (Subtarget.is32Bit() && Ty->isFP128Ty())
249 return true;
250
251 return false;
252 }
253
254 /// Helper for getByValTypeAlignment to determine
255 /// the desired ByVal argument alignment.
getMaxByValAlign(Type * Ty,Align & MaxAlign)256 static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
257 if (MaxAlign == 16)
258 return;
259 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
260 if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
261 MaxAlign = Align(16);
262 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
263 Align EltAlign;
264 getMaxByValAlign(ATy->getElementType(), EltAlign);
265 if (EltAlign > MaxAlign)
266 MaxAlign = EltAlign;
267 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
268 for (auto *EltTy : STy->elements()) {
269 Align EltAlign;
270 getMaxByValAlign(EltTy, EltAlign);
271 if (EltAlign > MaxAlign)
272 MaxAlign = EltAlign;
273 if (MaxAlign == 16)
274 break;
275 }
276 }
277 }
278
279 /// Return the desired alignment for ByVal aggregate
280 /// function arguments in the caller parameter area. For X86, aggregates
281 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
282 /// are at 4-byte boundaries.
getByValTypeAlignment(Type * Ty,const DataLayout & DL) const283 Align X86TargetLowering::getByValTypeAlignment(Type *Ty,
284 const DataLayout &DL) const {
285 if (Subtarget.is64Bit())
286 return std::max(DL.getABITypeAlign(Ty), Align::Constant<8>());
287
288 Align Alignment(4);
289 if (Subtarget.hasSSE1())
290 getMaxByValAlign(Ty, Alignment);
291 return Alignment;
292 }
293
294 /// It returns EVT::Other if the type should be determined using generic
295 /// target-independent logic.
296 /// For vector ops we check that the overall size isn't larger than our
297 /// preferred vector width.
getOptimalMemOpType(LLVMContext & Context,const MemOp & Op,const AttributeList & FuncAttributes) const298 EVT X86TargetLowering::getOptimalMemOpType(
299 LLVMContext &Context, const MemOp &Op,
300 const AttributeList &FuncAttributes) const {
301 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
302 if (Op.size() >= 16 &&
303 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
304 // FIXME: Check if unaligned 64-byte accesses are slow.
305 if (Op.size() >= 64 && Subtarget.hasAVX512() && Subtarget.hasEVEX512() &&
306 (Subtarget.getPreferVectorWidth() >= 512)) {
307 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
308 }
309 // FIXME: Check if unaligned 32-byte accesses are slow.
310 if (Op.size() >= 32 && Subtarget.hasAVX() &&
311 Subtarget.useLight256BitInstructions()) {
312 // Although this isn't a well-supported type for AVX1, we'll let
313 // legalization and shuffle lowering produce the optimal codegen. If we
314 // choose an optimal type with a vector element larger than a byte,
315 // getMemsetStores() may create an intermediate splat (using an integer
316 // multiply) before we splat as a vector.
317 return MVT::v32i8;
318 }
319 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
320 return MVT::v16i8;
321 // TODO: Can SSE1 handle a byte vector?
322 // If we have SSE1 registers we should be able to use them.
323 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
324 (Subtarget.getPreferVectorWidth() >= 128))
325 return MVT::v4f32;
326 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
327 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
328 // Do not use f64 to lower memcpy if source is string constant. It's
329 // better to use i32 to avoid the loads.
330 // Also, do not use f64 to lower memset unless this is a memset of zeros.
331 // The gymnastics of splatting a byte value into an XMM register and then
332 // only using 8-byte stores (because this is a CPU with slow unaligned
333 // 16-byte accesses) makes that a loser.
334 return MVT::f64;
335 }
336 }
337 // This is a compromise. If we reach here, unaligned accesses may be slow on
338 // this target. However, creating smaller, aligned accesses could be even
339 // slower and would certainly be a lot more code.
340 if (Subtarget.is64Bit() && Op.size() >= 8)
341 return MVT::i64;
342 return MVT::i32;
343 }
344
isSafeMemOpType(MVT VT) const345 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
346 if (VT == MVT::f32)
347 return Subtarget.hasSSE1();
348 if (VT == MVT::f64)
349 return Subtarget.hasSSE2();
350 return true;
351 }
352
isBitAligned(Align Alignment,uint64_t SizeInBits)353 static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
354 return (8 * Alignment.value()) % SizeInBits == 0;
355 }
356
isMemoryAccessFast(EVT VT,Align Alignment) const357 bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
358 if (isBitAligned(Alignment, VT.getSizeInBits()))
359 return true;
360 switch (VT.getSizeInBits()) {
361 default:
362 // 8-byte and under are always assumed to be fast.
363 return true;
364 case 128:
365 return !Subtarget.isUnalignedMem16Slow();
366 case 256:
367 return !Subtarget.isUnalignedMem32Slow();
368 // TODO: What about AVX-512 (512-bit) accesses?
369 }
370 }
371
allowsMisalignedMemoryAccesses(EVT VT,unsigned,Align Alignment,MachineMemOperand::Flags Flags,unsigned * Fast) const372 bool X86TargetLowering::allowsMisalignedMemoryAccesses(
373 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
374 unsigned *Fast) const {
375 if (Fast)
376 *Fast = isMemoryAccessFast(VT, Alignment);
377 // NonTemporal vector memory ops must be aligned.
378 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
379 // NT loads can only be vector aligned, so if its less aligned than the
380 // minimum vector size (which we can split the vector down to), we might as
381 // well use a regular unaligned vector load.
382 // We don't have any NT loads pre-SSE41.
383 if (!!(Flags & MachineMemOperand::MOLoad))
384 return (Alignment < 16 || !Subtarget.hasSSE41());
385 return false;
386 }
387 // Misaligned accesses of any size are always allowed.
388 return true;
389 }
390
allowsMemoryAccess(LLVMContext & Context,const DataLayout & DL,EVT VT,unsigned AddrSpace,Align Alignment,MachineMemOperand::Flags Flags,unsigned * Fast) const391 bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
392 const DataLayout &DL, EVT VT,
393 unsigned AddrSpace, Align Alignment,
394 MachineMemOperand::Flags Flags,
395 unsigned *Fast) const {
396 if (Fast)
397 *Fast = isMemoryAccessFast(VT, Alignment);
398 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
399 if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
400 /*Fast=*/nullptr))
401 return true;
402 // NonTemporal vector memory ops are special, and must be aligned.
403 if (!isBitAligned(Alignment, VT.getSizeInBits()))
404 return false;
405 switch (VT.getSizeInBits()) {
406 case 128:
407 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
408 return true;
409 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
410 return true;
411 return false;
412 case 256:
413 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
414 return true;
415 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
416 return true;
417 return false;
418 case 512:
419 if (Subtarget.hasAVX512() && Subtarget.hasEVEX512())
420 return true;
421 return false;
422 default:
423 return false; // Don't have NonTemporal vector memory ops of this size.
424 }
425 }
426 return true;
427 }
428
429 /// Return the entry encoding for a jump table in the
430 /// current function. The returned value is a member of the
431 /// MachineJumpTableInfo::JTEntryKind enum.
getJumpTableEncoding() const432 unsigned X86TargetLowering::getJumpTableEncoding() const {
433 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
434 // symbol.
435 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
436 return MachineJumpTableInfo::EK_Custom32;
437 if (isPositionIndependent() &&
438 getTargetMachine().getCodeModel() == CodeModel::Large &&
439 !Subtarget.isTargetCOFF())
440 return MachineJumpTableInfo::EK_LabelDifference64;
441
442 // Otherwise, use the normal jump table encoding heuristics.
443 return TargetLowering::getJumpTableEncoding();
444 }
445
useSoftFloat() const446 bool X86TargetLowering::useSoftFloat() const {
447 return Subtarget.useSoftFloat();
448 }
449
markLibCallAttributes(MachineFunction * MF,unsigned CC,ArgListTy & Args) const450 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
451 ArgListTy &Args) const {
452
453 // Only relabel X86-32 for C / Stdcall CCs.
454 if (Subtarget.is64Bit())
455 return;
456 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
457 return;
458 unsigned ParamRegs = 0;
459 if (auto *M = MF->getFunction().getParent())
460 ParamRegs = M->getNumberRegisterParameters();
461
462 // Mark the first N int arguments as having reg
463 for (auto &Arg : Args) {
464 Type *T = Arg.Ty;
465 if (T->isIntOrPtrTy())
466 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
467 unsigned numRegs = 1;
468 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
469 numRegs = 2;
470 if (ParamRegs < numRegs)
471 return;
472 ParamRegs -= numRegs;
473 Arg.IsInReg = true;
474 }
475 }
476 }
477
478 const MCExpr *
LowerCustomJumpTableEntry(const MachineJumpTableInfo * MJTI,const MachineBasicBlock * MBB,unsigned uid,MCContext & Ctx) const479 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
480 const MachineBasicBlock *MBB,
481 unsigned uid,MCContext &Ctx) const{
482 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
483 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
484 // entries.
485 return MCSymbolRefExpr::create(MBB->getSymbol(), X86::S_GOTOFF, Ctx);
486 }
487
488 /// Returns relocation base for the given PIC jumptable.
getPICJumpTableRelocBase(SDValue Table,SelectionDAG & DAG) const489 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
490 SelectionDAG &DAG) const {
491 if (!Subtarget.is64Bit())
492 // This doesn't have SDLoc associated with it, but is not really the
493 // same as a Register.
494 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
495 getPointerTy(DAG.getDataLayout()));
496 return Table;
497 }
498
499 /// This returns the relocation base for the given PIC jumptable,
500 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
501 const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction * MF,unsigned JTI,MCContext & Ctx) const502 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
503 MCContext &Ctx) const {
504 // X86-64 uses RIP relative addressing based on the jump table label.
505 if (Subtarget.isPICStyleRIPRel() ||
506 (Subtarget.is64Bit() &&
507 getTargetMachine().getCodeModel() == CodeModel::Large))
508 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
509
510 // Otherwise, the reference is relative to the PIC base.
511 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
512 }
513
514 std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo * TRI,MVT VT) const515 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
516 MVT VT) const {
517 const TargetRegisterClass *RRC = nullptr;
518 uint8_t Cost = 1;
519 switch (VT.SimpleTy) {
520 default:
521 return TargetLowering::findRepresentativeClass(TRI, VT);
522 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
523 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
524 break;
525 case MVT::x86mmx:
526 RRC = &X86::VR64RegClass;
527 break;
528 case MVT::f32: case MVT::f64:
529 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
530 case MVT::v4f32: case MVT::v2f64:
531 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
532 case MVT::v8f32: case MVT::v4f64:
533 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
534 case MVT::v16f32: case MVT::v8f64:
535 RRC = &X86::VR128XRegClass;
536 break;
537 }
538 return std::make_pair(RRC, Cost);
539 }
540
getAddressSpace() const541 unsigned X86TargetLowering::getAddressSpace() const {
542 if (Subtarget.is64Bit())
543 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? X86AS::GS
544 : X86AS::FS;
545 return X86AS::GS;
546 }
547
hasStackGuardSlotTLS(const Triple & TargetTriple)548 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
549 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
550 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
551 }
552
SegmentOffset(IRBuilderBase & IRB,int Offset,unsigned AddressSpace)553 static Constant* SegmentOffset(IRBuilderBase &IRB,
554 int Offset, unsigned AddressSpace) {
555 return ConstantExpr::getIntToPtr(
556 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
557 IRB.getPtrTy(AddressSpace));
558 }
559
getIRStackGuard(IRBuilderBase & IRB) const560 Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
561 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
562 // tcbhead_t; use it instead of the usual global variable (see
563 // sysdeps/{i386,x86_64}/nptl/tls.h)
564 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
565 unsigned AddressSpace = getAddressSpace();
566
567 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
568 if (Subtarget.isTargetFuchsia())
569 return SegmentOffset(IRB, 0x10, AddressSpace);
570
571 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
572 // Specially, some users may customize the base reg and offset.
573 int Offset = M->getStackProtectorGuardOffset();
574 // If we don't set -stack-protector-guard-offset value:
575 // %fs:0x28, unless we're using a Kernel code model, in which case
576 // it's %gs:0x28. gs:0x14 on i386.
577 if (Offset == INT_MAX)
578 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
579
580 StringRef GuardReg = M->getStackProtectorGuardReg();
581 if (GuardReg == "fs")
582 AddressSpace = X86AS::FS;
583 else if (GuardReg == "gs")
584 AddressSpace = X86AS::GS;
585
586 // Use symbol guard if user specify.
587 StringRef GuardSymb = M->getStackProtectorGuardSymbol();
588 if (!GuardSymb.empty()) {
589 GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
590 if (!GV) {
591 Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
592 : Type::getInt32Ty(M->getContext());
593 GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
594 nullptr, GuardSymb, nullptr,
595 GlobalValue::NotThreadLocal, AddressSpace);
596 if (!Subtarget.isTargetDarwin())
597 GV->setDSOLocal(M->getDirectAccessExternalData());
598 }
599 return GV;
600 }
601
602 return SegmentOffset(IRB, Offset, AddressSpace);
603 }
604 return TargetLowering::getIRStackGuard(IRB);
605 }
606
insertSSPDeclarations(Module & M) const607 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
608 // MSVC CRT provides functionalities for stack protection.
609 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
610 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
611 // MSVC CRT has a global variable holding security cookie.
612 M.getOrInsertGlobal("__security_cookie",
613 PointerType::getUnqual(M.getContext()));
614
615 // MSVC CRT has a function to validate security cookie.
616 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
617 "__security_check_cookie", Type::getVoidTy(M.getContext()),
618 PointerType::getUnqual(M.getContext()));
619 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
620 F->setCallingConv(CallingConv::X86_FastCall);
621 F->addParamAttr(0, Attribute::AttrKind::InReg);
622 }
623 return;
624 }
625
626 StringRef GuardMode = M.getStackProtectorGuard();
627
628 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
629 if ((GuardMode == "tls" || GuardMode.empty()) &&
630 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
631 return;
632 TargetLowering::insertSSPDeclarations(M);
633 }
634
getSDagStackGuard(const Module & M) const635 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
636 // MSVC CRT has a global variable holding security cookie.
637 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
638 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
639 return M.getGlobalVariable("__security_cookie");
640 }
641 return TargetLowering::getSDagStackGuard(M);
642 }
643
getSSPStackGuardCheck(const Module & M) const644 Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
645 // MSVC CRT has a function to validate security cookie.
646 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
647 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
648 return M.getFunction("__security_check_cookie");
649 }
650 return TargetLowering::getSSPStackGuardCheck(M);
651 }
652
653 Value *
getSafeStackPointerLocation(IRBuilderBase & IRB) const654 X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
655 // Android provides a fixed TLS slot for the SafeStack pointer. See the
656 // definition of TLS_SLOT_SAFESTACK in
657 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
658 if (Subtarget.isTargetAndroid()) {
659 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
660 // %gs:0x24 on i386
661 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
662 return SegmentOffset(IRB, Offset, getAddressSpace());
663 }
664
665 // Fuchsia is similar.
666 if (Subtarget.isTargetFuchsia()) {
667 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
668 return SegmentOffset(IRB, 0x18, getAddressSpace());
669 }
670
671 return TargetLowering::getSafeStackPointerLocation(IRB);
672 }
673
674 //===----------------------------------------------------------------------===//
675 // Return Value Calling Convention Implementation
676 //===----------------------------------------------------------------------===//
677
CanLowerReturn(CallingConv::ID CallConv,MachineFunction & MF,bool isVarArg,const SmallVectorImpl<ISD::OutputArg> & Outs,LLVMContext & Context,const Type * RetTy) const678 bool X86TargetLowering::CanLowerReturn(
679 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
680 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
681 const Type *RetTy) const {
682 SmallVector<CCValAssign, 16> RVLocs;
683 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
684 return CCInfo.CheckReturn(Outs, RetCC_X86);
685 }
686
getScratchRegisters(CallingConv::ID) const687 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
688 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
689 return ScratchRegs;
690 }
691
getRoundingControlRegisters() const692 ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
693 static const MCPhysReg RCRegs[] = {X86::FPCW, X86::MXCSR};
694 return RCRegs;
695 }
696
697 /// Lowers masks values (v*i1) to the local register values
698 /// \returns DAG node after lowering to register type
lowerMasksToReg(const SDValue & ValArg,const EVT & ValLoc,const SDLoc & DL,SelectionDAG & DAG)699 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
700 const SDLoc &DL, SelectionDAG &DAG) {
701 EVT ValVT = ValArg.getValueType();
702
703 if (ValVT == MVT::v1i1)
704 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ValLoc, ValArg,
705 DAG.getIntPtrConstant(0, DL));
706
707 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
708 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
709 // Two stage lowering might be required
710 // bitcast: v8i1 -> i8 / v16i1 -> i16
711 // anyextend: i8 -> i32 / i16 -> i32
712 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
713 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
714 if (ValLoc == MVT::i32)
715 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValToCopy);
716 return ValToCopy;
717 }
718
719 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
720 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
721 // One stage lowering is required
722 // bitcast: v32i1 -> i32 / v64i1 -> i64
723 return DAG.getBitcast(ValLoc, ValArg);
724 }
725
726 return DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValArg);
727 }
728
729 /// Breaks v64i1 value into two registers and adds the new node to the DAG
Passv64i1ArgInRegs(const SDLoc & DL,SelectionDAG & DAG,SDValue & Arg,SmallVectorImpl<std::pair<Register,SDValue>> & RegsToPass,CCValAssign & VA,CCValAssign & NextVA,const X86Subtarget & Subtarget)730 static void Passv64i1ArgInRegs(
731 const SDLoc &DL, SelectionDAG &DAG, SDValue &Arg,
732 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
733 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
734 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
735 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
736 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
737 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
738 "The value should reside in two registers");
739
740 // Before splitting the value we cast it to i64
741 Arg = DAG.getBitcast(MVT::i64, Arg);
742
743 // Splitting the value into two i32 types
744 SDValue Lo, Hi;
745 std::tie(Lo, Hi) = DAG.SplitScalar(Arg, DL, MVT::i32, MVT::i32);
746
747 // Attach the two i32 types into corresponding registers
748 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
749 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
750 }
751
752 SDValue
LowerReturn(SDValue Chain,CallingConv::ID CallConv,bool isVarArg,const SmallVectorImpl<ISD::OutputArg> & Outs,const SmallVectorImpl<SDValue> & OutVals,const SDLoc & dl,SelectionDAG & DAG) const753 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
754 bool isVarArg,
755 const SmallVectorImpl<ISD::OutputArg> &Outs,
756 const SmallVectorImpl<SDValue> &OutVals,
757 const SDLoc &dl, SelectionDAG &DAG) const {
758 MachineFunction &MF = DAG.getMachineFunction();
759 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
760
761 // In some cases we need to disable registers from the default CSR list.
762 // For example, when they are used as return registers (preserve_* and X86's
763 // regcall) or for argument passing (X86's regcall).
764 bool ShouldDisableCalleeSavedRegister =
765 shouldDisableRetRegFromCSR(CallConv) ||
766 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
767
768 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
769 report_fatal_error("X86 interrupts may not return any value");
770
771 SmallVector<CCValAssign, 16> RVLocs;
772 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
773 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
774
775 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
776 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
777 ++I, ++OutsIndex) {
778 CCValAssign &VA = RVLocs[I];
779 assert(VA.isRegLoc() && "Can only return in registers!");
780
781 // Add the register to the CalleeSaveDisableRegs list.
782 if (ShouldDisableCalleeSavedRegister)
783 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
784
785 SDValue ValToCopy = OutVals[OutsIndex];
786 EVT ValVT = ValToCopy.getValueType();
787
788 // Promote values to the appropriate types.
789 if (VA.getLocInfo() == CCValAssign::SExt)
790 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
791 else if (VA.getLocInfo() == CCValAssign::ZExt)
792 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
793 else if (VA.getLocInfo() == CCValAssign::AExt) {
794 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
795 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
796 else
797 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
798 }
799 else if (VA.getLocInfo() == CCValAssign::BCvt)
800 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
801
802 assert(VA.getLocInfo() != CCValAssign::FPExt &&
803 "Unexpected FP-extend for return value.");
804
805 // Report an error if we have attempted to return a value via an XMM
806 // register and SSE was disabled.
807 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
808 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
809 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
810 } else if (!Subtarget.hasSSE2() &&
811 X86::FR64XRegClass.contains(VA.getLocReg()) &&
812 ValVT == MVT::f64) {
813 // When returning a double via an XMM register, report an error if SSE2 is
814 // not enabled.
815 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
816 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
817 }
818
819 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
820 // the RET instruction and handled by the FP Stackifier.
821 if (VA.getLocReg() == X86::FP0 ||
822 VA.getLocReg() == X86::FP1) {
823 // If this is a copy from an xmm register to ST(0), use an FPExtend to
824 // change the value to the FP stack register class.
825 if (isScalarFPTypeInSSEReg(VA.getValVT()))
826 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
827 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
828 // Don't emit a copytoreg.
829 continue;
830 }
831
832 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
833 // which is returned in RAX / RDX.
834 if (Subtarget.is64Bit()) {
835 if (ValVT == MVT::x86mmx) {
836 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
837 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
838 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
839 ValToCopy);
840 // If we don't have SSE2 available, convert to v4f32 so the generated
841 // register is legal.
842 if (!Subtarget.hasSSE2())
843 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
844 }
845 }
846 }
847
848 if (VA.needsCustom()) {
849 assert(VA.getValVT() == MVT::v64i1 &&
850 "Currently the only custom case is when we split v64i1 to 2 regs");
851
852 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
853 Subtarget);
854
855 // Add the second register to the CalleeSaveDisableRegs list.
856 if (ShouldDisableCalleeSavedRegister)
857 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
858 } else {
859 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
860 }
861 }
862
863 SDValue Glue;
864 SmallVector<SDValue, 6> RetOps;
865 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
866 // Operand #1 = Bytes To Pop
867 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
868 MVT::i32));
869
870 // Copy the result values into the output registers.
871 for (auto &RetVal : RetVals) {
872 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
873 RetOps.push_back(RetVal.second);
874 continue; // Don't emit a copytoreg.
875 }
876
877 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
878 Glue = Chain.getValue(1);
879 RetOps.push_back(
880 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
881 }
882
883 // Swift calling convention does not require we copy the sret argument
884 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
885
886 // All x86 ABIs require that for returning structs by value we copy
887 // the sret argument into %rax/%eax (depending on ABI) for the return.
888 // We saved the argument into a virtual register in the entry block,
889 // so now we copy the value out and into %rax/%eax.
890 //
891 // Checking Function.hasStructRetAttr() here is insufficient because the IR
892 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
893 // false, then an sret argument may be implicitly inserted in the SelDAG. In
894 // either case FuncInfo->setSRetReturnReg() will have been called.
895 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
896 // When we have both sret and another return value, we should use the
897 // original Chain stored in RetOps[0], instead of the current Chain updated
898 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
899
900 // For the case of sret and another return value, we have
901 // Chain_0 at the function entry
902 // Chain_1 = getCopyToReg(Chain_0) in the above loop
903 // If we use Chain_1 in getCopyFromReg, we will have
904 // Val = getCopyFromReg(Chain_1)
905 // Chain_2 = getCopyToReg(Chain_1, Val) from below
906
907 // getCopyToReg(Chain_0) will be glued together with
908 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
909 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
910 // Data dependency from Unit B to Unit A due to usage of Val in
911 // getCopyToReg(Chain_1, Val)
912 // Chain dependency from Unit A to Unit B
913
914 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
915 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
916 getPointerTy(MF.getDataLayout()));
917
918 Register RetValReg
919 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
920 X86::RAX : X86::EAX;
921 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
922 Glue = Chain.getValue(1);
923
924 // RAX/EAX now acts like a return value.
925 RetOps.push_back(
926 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
927
928 // Add the returned register to the CalleeSaveDisableRegs list. Don't do
929 // this however for preserve_most/preserve_all to minimize the number of
930 // callee-saved registers for these CCs.
931 if (ShouldDisableCalleeSavedRegister &&
932 CallConv != CallingConv::PreserveAll &&
933 CallConv != CallingConv::PreserveMost)
934 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
935 }
936
937 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
938 const MCPhysReg *I =
939 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
940 if (I) {
941 for (; *I; ++I) {
942 if (X86::GR64RegClass.contains(*I))
943 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
944 else
945 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
946 }
947 }
948
949 RetOps[0] = Chain; // Update chain.
950
951 // Add the glue if we have it.
952 if (Glue.getNode())
953 RetOps.push_back(Glue);
954
955 X86ISD::NodeType opcode = X86ISD::RET_GLUE;
956 if (CallConv == CallingConv::X86_INTR)
957 opcode = X86ISD::IRET;
958 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
959 }
960
isUsedByReturnOnly(SDNode * N,SDValue & Chain) const961 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
962 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
963 return false;
964
965 SDValue TCChain = Chain;
966 SDNode *Copy = *N->user_begin();
967 if (Copy->getOpcode() == ISD::CopyToReg) {
968 // If the copy has a glue operand, we conservatively assume it isn't safe to
969 // perform a tail call.
970 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
971 return false;
972 TCChain = Copy->getOperand(0);
973 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
974 return false;
975
976 bool HasRet = false;
977 for (const SDNode *U : Copy->users()) {
978 if (U->getOpcode() != X86ISD::RET_GLUE)
979 return false;
980 // If we are returning more than one value, we can definitely
981 // not make a tail call see PR19530
982 if (U->getNumOperands() > 4)
983 return false;
984 if (U->getNumOperands() == 4 &&
985 U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
986 return false;
987 HasRet = true;
988 }
989
990 if (!HasRet)
991 return false;
992
993 Chain = TCChain;
994 return true;
995 }
996
getTypeForExtReturn(LLVMContext & Context,EVT VT,ISD::NodeType ExtendKind) const997 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
998 ISD::NodeType ExtendKind) const {
999 MVT ReturnMVT = MVT::i32;
1000
1001 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
1002 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
1003 // The ABI does not require i1, i8 or i16 to be extended.
1004 //
1005 // On Darwin, there is code in the wild relying on Clang's old behaviour of
1006 // always extending i8/i16 return values, so keep doing that for now.
1007 // (PR26665).
1008 ReturnMVT = MVT::i8;
1009 }
1010
1011 EVT MinVT = getRegisterType(Context, ReturnMVT);
1012 return VT.bitsLT(MinVT) ? MinVT : VT;
1013 }
1014
1015 /// Reads two 32 bit registers and creates a 64 bit mask value.
1016 /// \param VA The current 32 bit value that need to be assigned.
1017 /// \param NextVA The next 32 bit value that need to be assigned.
1018 /// \param Root The parent DAG node.
1019 /// \param [in,out] InGlue Represents SDvalue in the parent DAG node for
1020 /// glue purposes. In the case the DAG is already using
1021 /// physical register instead of virtual, we should glue
1022 /// our new SDValue to InGlue SDvalue.
1023 /// \return a new SDvalue of size 64bit.
getv64i1Argument(CCValAssign & VA,CCValAssign & NextVA,SDValue & Root,SelectionDAG & DAG,const SDLoc & DL,const X86Subtarget & Subtarget,SDValue * InGlue=nullptr)1024 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
1025 SDValue &Root, SelectionDAG &DAG,
1026 const SDLoc &DL, const X86Subtarget &Subtarget,
1027 SDValue *InGlue = nullptr) {
1028 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
1029 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
1030 assert(VA.getValVT() == MVT::v64i1 &&
1031 "Expecting first location of 64 bit width type");
1032 assert(NextVA.getValVT() == VA.getValVT() &&
1033 "The locations should have the same type");
1034 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
1035 "The values should reside in two registers");
1036
1037 SDValue Lo, Hi;
1038 SDValue ArgValueLo, ArgValueHi;
1039
1040 MachineFunction &MF = DAG.getMachineFunction();
1041 const TargetRegisterClass *RC = &X86::GR32RegClass;
1042
1043 // Read a 32 bit value from the registers.
1044 if (nullptr == InGlue) {
1045 // When no physical register is present,
1046 // create an intermediate virtual register.
1047 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
1048 ArgValueLo = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
1049 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
1050 ArgValueHi = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
1051 } else {
1052 // When a physical register is available read the value from it and glue
1053 // the reads together.
1054 ArgValueLo =
1055 DAG.getCopyFromReg(Root, DL, VA.getLocReg(), MVT::i32, *InGlue);
1056 *InGlue = ArgValueLo.getValue(2);
1057 ArgValueHi =
1058 DAG.getCopyFromReg(Root, DL, NextVA.getLocReg(), MVT::i32, *InGlue);
1059 *InGlue = ArgValueHi.getValue(2);
1060 }
1061
1062 // Convert the i32 type into v32i1 type.
1063 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
1064
1065 // Convert the i32 type into v32i1 type.
1066 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
1067
1068 // Concatenate the two values together.
1069 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v64i1, Lo, Hi);
1070 }
1071
1072 /// The function will lower a register of various sizes (8/16/32/64)
1073 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
1074 /// \returns a DAG node contains the operand after lowering to mask type.
lowerRegToMasks(const SDValue & ValArg,const EVT & ValVT,const EVT & ValLoc,const SDLoc & DL,SelectionDAG & DAG)1075 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
1076 const EVT &ValLoc, const SDLoc &DL,
1077 SelectionDAG &DAG) {
1078 SDValue ValReturned = ValArg;
1079
1080 if (ValVT == MVT::v1i1)
1081 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, ValReturned);
1082
1083 if (ValVT == MVT::v64i1) {
1084 // In 32 bit machine, this case is handled by getv64i1Argument
1085 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
1086 // In 64 bit machine, There is no need to truncate the value only bitcast
1087 } else {
1088 MVT MaskLenVT;
1089 switch (ValVT.getSimpleVT().SimpleTy) {
1090 case MVT::v8i1:
1091 MaskLenVT = MVT::i8;
1092 break;
1093 case MVT::v16i1:
1094 MaskLenVT = MVT::i16;
1095 break;
1096 case MVT::v32i1:
1097 MaskLenVT = MVT::i32;
1098 break;
1099 default:
1100 llvm_unreachable("Expecting a vector of i1 types");
1101 }
1102
1103 ValReturned = DAG.getNode(ISD::TRUNCATE, DL, MaskLenVT, ValReturned);
1104 }
1105 return DAG.getBitcast(ValVT, ValReturned);
1106 }
1107
getPopFromX87Reg(SelectionDAG & DAG,SDValue Chain,const SDLoc & dl,Register Reg,EVT VT,SDValue Glue)1108 static SDValue getPopFromX87Reg(SelectionDAG &DAG, SDValue Chain,
1109 const SDLoc &dl, Register Reg, EVT VT,
1110 SDValue Glue) {
1111 SDVTList VTs = DAG.getVTList(VT, MVT::Other, MVT::Glue);
1112 SDValue Ops[] = {Chain, DAG.getRegister(Reg, VT), Glue};
1113 return DAG.getNode(X86ISD::POP_FROM_X87_REG, dl, VTs,
1114 ArrayRef(Ops, Glue.getNode() ? 3 : 2));
1115 }
1116
1117 /// Lower the result values of a call into the
1118 /// appropriate copies out of appropriate physical registers.
1119 ///
LowerCallResult(SDValue Chain,SDValue InGlue,CallingConv::ID CallConv,bool isVarArg,const SmallVectorImpl<ISD::InputArg> & Ins,const SDLoc & dl,SelectionDAG & DAG,SmallVectorImpl<SDValue> & InVals,uint32_t * RegMask) const1120 SDValue X86TargetLowering::LowerCallResult(
1121 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
1122 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1123 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
1124 uint32_t *RegMask) const {
1125
1126 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
1127 // Assign locations to each value returned by this call.
1128 SmallVector<CCValAssign, 16> RVLocs;
1129 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1130 *DAG.getContext());
1131 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1132
1133 // Copy all of the result registers out of their specified physreg.
1134 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
1135 ++I, ++InsIndex) {
1136 CCValAssign &VA = RVLocs[I];
1137 EVT CopyVT = VA.getLocVT();
1138
1139 // In some calling conventions we need to remove the used registers
1140 // from the register mask.
1141 if (RegMask) {
1142 for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg()))
1143 RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
1144 }
1145
1146 // Report an error if there was an attempt to return FP values via XMM
1147 // registers.
1148 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
1149 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
1150 if (VA.getLocReg() == X86::XMM1)
1151 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
1152 else
1153 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
1154 } else if (!Subtarget.hasSSE2() &&
1155 X86::FR64XRegClass.contains(VA.getLocReg()) &&
1156 CopyVT == MVT::f64) {
1157 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
1158 if (VA.getLocReg() == X86::XMM1)
1159 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
1160 else
1161 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
1162 }
1163
1164 // If we prefer to use the value in xmm registers, copy it out as f80 and
1165 // use a truncate to move it from fp stack reg to xmm reg.
1166 bool RoundAfterCopy = false;
1167 bool X87Result = VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1;
1168 if (X87Result && isScalarFPTypeInSSEReg(VA.getValVT())) {
1169 if (!Subtarget.hasX87())
1170 report_fatal_error("X87 register return with X87 disabled");
1171 CopyVT = MVT::f80;
1172 RoundAfterCopy = (CopyVT != VA.getLocVT());
1173 }
1174
1175 SDValue Val;
1176 if (VA.needsCustom()) {
1177 assert(VA.getValVT() == MVT::v64i1 &&
1178 "Currently the only custom case is when we split v64i1 to 2 regs");
1179 Val =
1180 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
1181 } else {
1182 Chain =
1183 X87Result
1184 ? getPopFromX87Reg(DAG, Chain, dl, VA.getLocReg(), CopyVT, InGlue)
1185 .getValue(1)
1186 : DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
1187 .getValue(1);
1188 Val = Chain.getValue(0);
1189 InGlue = Chain.getValue(2);
1190 }
1191
1192 if (RoundAfterCopy)
1193 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1194 // This truncation won't change the value.
1195 DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
1196
1197 if (VA.isExtInLoc()) {
1198 if (VA.getValVT().isVector() &&
1199 VA.getValVT().getScalarType() == MVT::i1 &&
1200 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
1201 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
1202 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
1203 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
1204 } else
1205 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
1206 }
1207
1208 if (VA.getLocInfo() == CCValAssign::BCvt)
1209 Val = DAG.getBitcast(VA.getValVT(), Val);
1210
1211 InVals.push_back(Val);
1212 }
1213
1214 return Chain;
1215 }
1216
//===----------------------------------------------------------------------===//
//               C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention only slightly: the
//  callee, not the caller, cleans up the stack, and symbols are also
//  decorated in a special way. It doesn't support any vector arguments.
//  For info on the fast calling convention, see the Fast Calling Convention
//  (tail call) implementation, LowerX86_32FastCCCallTo.
1226
1227 /// Determines whether Args, either a set of outgoing arguments to a call, or a
1228 /// set of incoming args of a call, contains an sret pointer that the callee
1229 /// pops
1230 template <typename T>
hasCalleePopSRet(const SmallVectorImpl<T> & Args,const X86Subtarget & Subtarget)1231 static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
1232 const X86Subtarget &Subtarget) {
1233 // Not C++20 (yet), so no concepts available.
1234 static_assert(std::is_same_v<T, ISD::OutputArg> ||
1235 std::is_same_v<T, ISD::InputArg>,
1236 "requires ISD::OutputArg or ISD::InputArg");
1237
1238 // Only 32-bit pops the sret. It's a 64-bit world these days, so early-out
1239 // for most compilations.
1240 if (!Subtarget.is32Bit())
1241 return false;
1242
1243 if (Args.empty())
1244 return false;
1245
1246 // Most calls do not have an sret argument, check the arg next.
1247 const ISD::ArgFlagsTy &Flags = Args[0].Flags;
1248 if (!Flags.isSRet() || Flags.isInReg())
1249 return false;
1250
1251 // The MSVCabi does not pop the sret.
1252 if (Subtarget.getTargetTriple().isOSMSVCRT())
1253 return false;
1254
1255 // MCUs don't pop the sret
1256 if (Subtarget.isTargetMCU())
1257 return false;
1258
1259 // Callee pops argument
1260 return true;
1261 }
1262
1263 /// Make a copy of an aggregate at address specified by "Src" to address
1264 /// "Dst" with size and alignment information specified by the specific
1265 /// parameter attribute. The copy will be passed as a byval function parameter.
CreateCopyOfByValArgument(SDValue Src,SDValue Dst,SDValue Chain,ISD::ArgFlagsTy Flags,SelectionDAG & DAG,const SDLoc & dl)1266 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
1267 SDValue Chain, ISD::ArgFlagsTy Flags,
1268 SelectionDAG &DAG, const SDLoc &dl) {
1269 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
1270
1271 return DAG.getMemcpy(
1272 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
1273 /*isVolatile*/ false, /*AlwaysInline=*/true,
1274 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), MachinePointerInfo());
1275 }
1276
1277 /// Return true if the calling convention is one that we can guarantee TCO for.
canGuaranteeTCO(CallingConv::ID CC)1278 static bool canGuaranteeTCO(CallingConv::ID CC) {
1279 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
1280 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
1281 CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
1282 }
1283
1284 /// Return true if we might ever do TCO for calls with this calling convention.
mayTailCallThisCC(CallingConv::ID CC)1285 static bool mayTailCallThisCC(CallingConv::ID CC) {
1286 switch (CC) {
1287 // C calling conventions:
1288 case CallingConv::C:
1289 case CallingConv::Win64:
1290 case CallingConv::X86_64_SysV:
1291 case CallingConv::PreserveNone:
1292 // Callee pop conventions:
1293 case CallingConv::X86_ThisCall:
1294 case CallingConv::X86_StdCall:
1295 case CallingConv::X86_VectorCall:
1296 case CallingConv::X86_FastCall:
1297 // Swift:
1298 case CallingConv::Swift:
1299 return true;
1300 default:
1301 return canGuaranteeTCO(CC);
1302 }
1303 }
1304
1305 /// Return true if the function is being made into a tailcall target by
1306 /// changing its ABI.
shouldGuaranteeTCO(CallingConv::ID CC,bool GuaranteedTailCallOpt)1307 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
1308 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
1309 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
1310 }
1311
mayBeEmittedAsTailCall(const CallInst * CI) const1312 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
1313 if (!CI->isTailCall())
1314 return false;
1315
1316 CallingConv::ID CalleeCC = CI->getCallingConv();
1317 if (!mayTailCallThisCC(CalleeCC))
1318 return false;
1319
1320 return true;
1321 }
1322
1323 SDValue
LowerMemArgument(SDValue Chain,CallingConv::ID CallConv,const SmallVectorImpl<ISD::InputArg> & Ins,const SDLoc & dl,SelectionDAG & DAG,const CCValAssign & VA,MachineFrameInfo & MFI,unsigned i) const1324 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
1325 const SmallVectorImpl<ISD::InputArg> &Ins,
1326 const SDLoc &dl, SelectionDAG &DAG,
1327 const CCValAssign &VA,
1328 MachineFrameInfo &MFI, unsigned i) const {
1329 // Create the nodes corresponding to a load from this parameter slot.
1330 ISD::ArgFlagsTy Flags = Ins[i].Flags;
1331 bool AlwaysUseMutable = shouldGuaranteeTCO(
1332 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
1333 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1334 EVT ValVT;
1335 MVT PtrVT = getPointerTy(DAG.getDataLayout());
1336
1337 // If value is passed by pointer we have address passed instead of the value
1338 // itself. No need to extend if the mask value and location share the same
1339 // absolute size.
1340 bool ExtendedInMem =
1341 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
1342 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
1343
1344 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
1345 ValVT = VA.getLocVT();
1346 else
1347 ValVT = VA.getValVT();
1348
1349 // FIXME: For now, all byval parameter objects are marked mutable. This can be
1350 // changed with more analysis.
1351 // In case of tail call optimization mark all arguments mutable. Since they
1352 // could be overwritten by lowering of arguments in case of a tail call.
1353 if (Flags.isByVal()) {
1354 unsigned Bytes = Flags.getByValSize();
1355 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
1356
1357 // FIXME: For now, all byval parameter objects are marked as aliasing. This
1358 // can be improved with deeper analysis.
1359 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
1360 /*isAliased=*/true);
1361 return DAG.getFrameIndex(FI, PtrVT);
1362 }
1363
1364 EVT ArgVT = Ins[i].ArgVT;
1365
1366 // If this is a vector that has been split into multiple parts, don't elide
1367 // the copy. The layout on the stack may not match the packed in-memory
1368 // layout.
1369 bool ScalarizedVector = ArgVT.isVector() && !VA.getLocVT().isVector();
1370
1371 // This is an argument in memory. We might be able to perform copy elision.
1372 // If the argument is passed directly in memory without any extension, then we
1373 // can perform copy elision. Large vector types, for example, may be passed
1374 // indirectly by pointer.
1375 if (Flags.isCopyElisionCandidate() &&
1376 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
1377 !ScalarizedVector) {
1378 SDValue PartAddr;
1379 if (Ins[i].PartOffset == 0) {
1380 // If this is a one-part value or the first part of a multi-part value,
1381 // create a stack object for the entire argument value type and return a
1382 // load from our portion of it. This assumes that if the first part of an
1383 // argument is in memory, the rest will also be in memory.
1384 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
1385 /*IsImmutable=*/false);
1386 PartAddr = DAG.getFrameIndex(FI, PtrVT);
1387 return DAG.getLoad(
1388 ValVT, dl, Chain, PartAddr,
1389 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
1390 }
1391
1392 // This is not the first piece of an argument in memory. See if there is
1393 // already a fixed stack object including this offset. If so, assume it
1394 // was created by the PartOffset == 0 branch above and create a load from
1395 // the appropriate offset into it.
1396 int64_t PartBegin = VA.getLocMemOffset();
1397 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
1398 int FI = MFI.getObjectIndexBegin();
1399 for (; MFI.isFixedObjectIndex(FI); ++FI) {
1400 int64_t ObjBegin = MFI.getObjectOffset(FI);
1401 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
1402 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
1403 break;
1404 }
1405 if (MFI.isFixedObjectIndex(FI)) {
1406 SDValue Addr =
1407 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
1408 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
1409 return DAG.getLoad(ValVT, dl, Chain, Addr,
1410 MachinePointerInfo::getFixedStack(
1411 DAG.getMachineFunction(), FI, Ins[i].PartOffset));
1412 }
1413 }
1414
1415 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
1416 VA.getLocMemOffset(), isImmutable);
1417
1418 // Set SExt or ZExt flag.
1419 if (VA.getLocInfo() == CCValAssign::ZExt) {
1420 MFI.setObjectZExt(FI, true);
1421 } else if (VA.getLocInfo() == CCValAssign::SExt) {
1422 MFI.setObjectSExt(FI, true);
1423 }
1424
1425 MaybeAlign Alignment;
1426 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
1427 ValVT != MVT::f80)
1428 Alignment = MaybeAlign(4);
1429 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
1430 SDValue Val = DAG.getLoad(
1431 ValVT, dl, Chain, FIN,
1432 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1433 Alignment);
1434 return ExtendedInMem
1435 ? (VA.getValVT().isVector()
1436 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
1437 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
1438 : Val;
1439 }
1440
1441 // FIXME: Get this from tablegen.
get64BitArgumentGPRs(CallingConv::ID CallConv,const X86Subtarget & Subtarget)1442 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
1443 const X86Subtarget &Subtarget) {
1444 assert(Subtarget.is64Bit());
1445
1446 if (Subtarget.isCallingConvWin64(CallConv)) {
1447 static const MCPhysReg GPR64ArgRegsWin64[] = {
1448 X86::RCX, X86::RDX, X86::R8, X86::R9
1449 };
1450 return GPR64ArgRegsWin64;
1451 }
1452
1453 static const MCPhysReg GPR64ArgRegs64Bit[] = {
1454 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1455 };
1456 return GPR64ArgRegs64Bit;
1457 }
1458
1459 // FIXME: Get this from tablegen.
get64BitArgumentXMMs(MachineFunction & MF,CallingConv::ID CallConv,const X86Subtarget & Subtarget)1460 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
1461 CallingConv::ID CallConv,
1462 const X86Subtarget &Subtarget) {
1463 assert(Subtarget.is64Bit());
1464 if (Subtarget.isCallingConvWin64(CallConv)) {
1465 // The XMM registers which might contain var arg parameters are shadowed
1466 // in their paired GPR. So we only need to save the GPR to their home
1467 // slots.
1468 // TODO: __vectorcall will change this.
1469 return {};
1470 }
1471
1472 bool isSoftFloat = Subtarget.useSoftFloat();
1473 if (isSoftFloat || !Subtarget.hasSSE1())
1474 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
1475 // registers.
1476 return {};
1477
1478 static const MCPhysReg XMMArgRegs64Bit[] = {
1479 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1480 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1481 };
1482 return XMMArgRegs64Bit;
1483 }
1484
1485 #ifndef NDEBUG
isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs)1486 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
1487 return llvm::is_sorted(
1488 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
1489 return A.getValNo() < B.getValNo();
1490 });
1491 }
1492 #endif
1493
1494 namespace {
1495 /// This is a helper class for lowering variable arguments parameters.
1496 class VarArgsLoweringHelper {
1497 public:
VarArgsLoweringHelper(X86MachineFunctionInfo * FuncInfo,const SDLoc & Loc,SelectionDAG & DAG,const X86Subtarget & Subtarget,CallingConv::ID CallConv,CCState & CCInfo)1498 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
1499 SelectionDAG &DAG, const X86Subtarget &Subtarget,
1500 CallingConv::ID CallConv, CCState &CCInfo)
1501 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
1502 TheMachineFunction(DAG.getMachineFunction()),
1503 TheFunction(TheMachineFunction.getFunction()),
1504 FrameInfo(TheMachineFunction.getFrameInfo()),
1505 FrameLowering(*Subtarget.getFrameLowering()),
1506 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
1507 CCInfo(CCInfo) {}
1508
1509 // Lower variable arguments parameters.
1510 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
1511
1512 private:
1513 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
1514
1515 void forwardMustTailParameters(SDValue &Chain);
1516
is64Bit() const1517 bool is64Bit() const { return Subtarget.is64Bit(); }
isWin64() const1518 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
1519
1520 X86MachineFunctionInfo *FuncInfo;
1521 const SDLoc &DL;
1522 SelectionDAG &DAG;
1523 const X86Subtarget &Subtarget;
1524 MachineFunction &TheMachineFunction;
1525 const Function &TheFunction;
1526 MachineFrameInfo &FrameInfo;
1527 const TargetFrameLowering &FrameLowering;
1528 const TargetLowering &TargLowering;
1529 CallingConv::ID CallConv;
1530 CCState &CCInfo;
1531 };
1532 } // namespace
1533
createVarArgAreaAndStoreRegisters(SDValue & Chain,unsigned StackSize)1534 void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
1535 SDValue &Chain, unsigned StackSize) {
1536 // If the function takes variable number of arguments, make a frame index for
1537 // the start of the first vararg value... for expansion of llvm.va_start. We
1538 // can skip this if there are no va_start calls.
1539 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
1540 CallConv != CallingConv::X86_ThisCall)) {
1541 FuncInfo->setVarArgsFrameIndex(
1542 FrameInfo.CreateFixedObject(1, StackSize, true));
1543 }
1544
1545 // 64-bit calling conventions support varargs and register parameters, so we
1546 // have to do extra work to spill them in the prologue.
1547 if (is64Bit()) {
1548 // Find the first unallocated argument registers.
1549 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
1550 ArrayRef<MCPhysReg> ArgXMMs =
1551 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
1552 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
1553 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
1554
1555 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
1556 "SSE register cannot be used when SSE is disabled!");
1557
1558 if (isWin64()) {
1559 // Get to the caller-allocated home save location. Add 8 to account
1560 // for the return address.
1561 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
1562 FuncInfo->setRegSaveFrameIndex(
1563 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
1564 // Fixup to set vararg frame on shadow area (4 x i64).
1565 if (NumIntRegs < 4)
1566 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
1567 } else {
1568 // For X86-64, if there are vararg parameters that are passed via
1569 // registers, then we must store them to their spots on the stack so
1570 // they may be loaded by dereferencing the result of va_next.
1571 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
1572 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
1573 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
1574 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
1575 }
1576
1577 SmallVector<SDValue, 6>
1578 LiveGPRs; // list of SDValue for GPR registers keeping live input value
1579 SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
1580 // keeping live input value
1581 SDValue ALVal; // if applicable keeps SDValue for %al register
1582
1583 // Gather all the live in physical registers.
1584 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
1585 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
1586 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
1587 }
1588 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
1589 if (!AvailableXmms.empty()) {
1590 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
1591 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
1592 for (MCPhysReg Reg : AvailableXmms) {
1593 // FastRegisterAllocator spills virtual registers at basic
1594 // block boundary. That leads to usages of xmm registers
1595 // outside of check for %al. Pass physical registers to
1596 // VASTART_SAVE_XMM_REGS to avoid unneccessary spilling.
1597 TheMachineFunction.getRegInfo().addLiveIn(Reg);
1598 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
1599 }
1600 }
1601
1602 // Store the integer parameter registers.
1603 SmallVector<SDValue, 8> MemOps;
1604 SDValue RSFIN =
1605 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
1606 TargLowering.getPointerTy(DAG.getDataLayout()));
1607 unsigned Offset = FuncInfo->getVarArgsGPOffset();
1608 for (SDValue Val : LiveGPRs) {
1609 SDValue FIN = DAG.getNode(ISD::ADD, DL,
1610 TargLowering.getPointerTy(DAG.getDataLayout()),
1611 RSFIN, DAG.getIntPtrConstant(Offset, DL));
1612 SDValue Store =
1613 DAG.getStore(Val.getValue(1), DL, Val, FIN,
1614 MachinePointerInfo::getFixedStack(
1615 DAG.getMachineFunction(),
1616 FuncInfo->getRegSaveFrameIndex(), Offset));
1617 MemOps.push_back(Store);
1618 Offset += 8;
1619 }
1620
1621 // Now store the XMM (fp + vector) parameter registers.
1622 if (!LiveXMMRegs.empty()) {
1623 SmallVector<SDValue, 12> SaveXMMOps;
1624 SaveXMMOps.push_back(Chain);
1625 SaveXMMOps.push_back(ALVal);
1626 SaveXMMOps.push_back(RSFIN);
1627 SaveXMMOps.push_back(
1628 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
1629 llvm::append_range(SaveXMMOps, LiveXMMRegs);
1630 MachineMemOperand *StoreMMO =
1631 DAG.getMachineFunction().getMachineMemOperand(
1632 MachinePointerInfo::getFixedStack(
1633 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
1634 Offset),
1635 MachineMemOperand::MOStore, 128, Align(16));
1636 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
1637 DL, DAG.getVTList(MVT::Other),
1638 SaveXMMOps, MVT::i8, StoreMMO));
1639 }
1640
1641 if (!MemOps.empty())
1642 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
1643 }
1644 }
1645
forwardMustTailParameters(SDValue & Chain)1646 void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
1647 // Find the largest legal vector type.
1648 MVT VecVT = MVT::Other;
1649 // FIXME: Only some x86_32 calling conventions support AVX512.
1650 if (Subtarget.useAVX512Regs() &&
1651 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
1652 CallConv == CallingConv::Intel_OCL_BI)))
1653 VecVT = MVT::v16f32;
1654 else if (Subtarget.hasAVX())
1655 VecVT = MVT::v8f32;
1656 else if (Subtarget.hasSSE2())
1657 VecVT = MVT::v4f32;
1658
1659 // We forward some GPRs and some vector types.
1660 SmallVector<MVT, 2> RegParmTypes;
1661 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
1662 RegParmTypes.push_back(IntVT);
1663 if (VecVT != MVT::Other)
1664 RegParmTypes.push_back(VecVT);
1665
1666 // Compute the set of forwarded registers. The rest are scratch.
1667 SmallVectorImpl<ForwardedRegister> &Forwards =
1668 FuncInfo->getForwardedMustTailRegParms();
1669 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
1670
1671 // Forward AL for SysV x86_64 targets, since it is used for varargs.
1672 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
1673 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
1674 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
1675 }
1676
1677 // Copy all forwards from physical to virtual registers.
1678 for (ForwardedRegister &FR : Forwards) {
1679 // FIXME: Can we use a less constrained schedule?
1680 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
1681 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
1682 TargLowering.getRegClassFor(FR.VT));
1683 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
1684 }
1685 }
1686
lowerVarArgsParameters(SDValue & Chain,unsigned StackSize)1687 void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
1688 unsigned StackSize) {
1689 // Set FrameIndex to the 0xAAAAAAA value to mark unset state.
1690 // If necessary, it would be set into the correct value later.
1691 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
1692 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1693
1694 if (FrameInfo.hasVAStart())
1695 createVarArgAreaAndStoreRegisters(Chain, StackSize);
1696
1697 if (FrameInfo.hasMustTailInVarArgFunc())
1698 forwardMustTailParameters(Chain);
1699 }
1700
LowerFormalArguments(SDValue Chain,CallingConv::ID CallConv,bool IsVarArg,const SmallVectorImpl<ISD::InputArg> & Ins,const SDLoc & dl,SelectionDAG & DAG,SmallVectorImpl<SDValue> & InVals) const1701 SDValue X86TargetLowering::LowerFormalArguments(
1702 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
1703 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1704 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1705 MachineFunction &MF = DAG.getMachineFunction();
1706 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1707
1708 const Function &F = MF.getFunction();
1709 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
1710 F.getName() == "main")
1711 FuncInfo->setForceFramePointer(true);
1712
1713 MachineFrameInfo &MFI = MF.getFrameInfo();
1714 bool Is64Bit = Subtarget.is64Bit();
1715 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
1716
1717 assert(
1718 !(IsVarArg && canGuaranteeTCO(CallConv)) &&
1719 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
1720
1721 // Assign locations to all of the incoming arguments.
1722 SmallVector<CCValAssign, 16> ArgLocs;
1723 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
1724
1725 // Allocate shadow area for Win64.
1726 if (IsWin64)
1727 CCInfo.AllocateStack(32, Align(8));
1728
1729 CCInfo.AnalyzeArguments(Ins, CC_X86);
1730
1731 // In vectorcall calling convention a second pass is required for the HVA
1732 // types.
1733 if (CallingConv::X86_VectorCall == CallConv) {
1734 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
1735 }
1736
1737 // The next loop assumes that the locations are in the same order of the
1738 // input arguments.
1739 assert(isSortedByValueNo(ArgLocs) &&
1740 "Argument Location list must be sorted before lowering");
1741
1742 SDValue ArgValue;
1743 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
1744 ++I, ++InsIndex) {
1745 assert(InsIndex < Ins.size() && "Invalid Ins index");
1746 CCValAssign &VA = ArgLocs[I];
1747
1748 if (VA.isRegLoc()) {
1749 EVT RegVT = VA.getLocVT();
1750 if (VA.needsCustom()) {
1751 assert(
1752 VA.getValVT() == MVT::v64i1 &&
1753 "Currently the only custom case is when we split v64i1 to 2 regs");
1754
1755 // v64i1 values, in regcall calling convention, that are
1756 // compiled to 32 bit arch, are split up into two registers.
1757 ArgValue =
1758 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
1759 } else {
1760 const TargetRegisterClass *RC;
1761 if (RegVT == MVT::i8)
1762 RC = &X86::GR8RegClass;
1763 else if (RegVT == MVT::i16)
1764 RC = &X86::GR16RegClass;
1765 else if (RegVT == MVT::i32)
1766 RC = &X86::GR32RegClass;
1767 else if (Is64Bit && RegVT == MVT::i64)
1768 RC = &X86::GR64RegClass;
1769 else if (RegVT == MVT::f16)
1770 RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
1771 else if (RegVT == MVT::f32)
1772 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
1773 else if (RegVT == MVT::f64)
1774 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
1775 else if (RegVT == MVT::f80)
1776 RC = &X86::RFP80RegClass;
1777 else if (RegVT == MVT::f128)
1778 RC = &X86::VR128RegClass;
1779 else if (RegVT.is512BitVector())
1780 RC = &X86::VR512RegClass;
1781 else if (RegVT.is256BitVector())
1782 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
1783 else if (RegVT.is128BitVector())
1784 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
1785 else if (RegVT == MVT::x86mmx)
1786 RC = &X86::VR64RegClass;
1787 else if (RegVT == MVT::v1i1)
1788 RC = &X86::VK1RegClass;
1789 else if (RegVT == MVT::v8i1)
1790 RC = &X86::VK8RegClass;
1791 else if (RegVT == MVT::v16i1)
1792 RC = &X86::VK16RegClass;
1793 else if (RegVT == MVT::v32i1)
1794 RC = &X86::VK32RegClass;
1795 else if (RegVT == MVT::v64i1)
1796 RC = &X86::VK64RegClass;
1797 else
1798 llvm_unreachable("Unknown argument type!");
1799
1800 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
1801 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1802 }
1803
1804 // If this is an 8 or 16-bit value, it is really passed promoted to 32
1805 // bits. Insert an assert[sz]ext to capture this, then truncate to the
1806 // right size.
1807 if (VA.getLocInfo() == CCValAssign::SExt)
1808 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1809 DAG.getValueType(VA.getValVT()));
1810 else if (VA.getLocInfo() == CCValAssign::ZExt)
1811 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1812 DAG.getValueType(VA.getValVT()));
1813 else if (VA.getLocInfo() == CCValAssign::BCvt)
1814 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
1815
1816 if (VA.isExtInLoc()) {
1817 // Handle MMX values passed in XMM regs.
1818 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
1819 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
1820 else if (VA.getValVT().isVector() &&
1821 VA.getValVT().getScalarType() == MVT::i1 &&
1822 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
1823 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
1824 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
1825 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
1826 } else
1827 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1828 }
1829 } else {
1830 assert(VA.isMemLoc());
1831 ArgValue =
1832 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
1833 }
1834
1835 // If value is passed via pointer - do a load.
1836 if (VA.getLocInfo() == CCValAssign::Indirect &&
1837 !(Ins[I].Flags.isByVal() && VA.isRegLoc())) {
1838 ArgValue =
1839 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
1840 }
1841
1842 InVals.push_back(ArgValue);
1843 }
1844
1845 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
1846 if (Ins[I].Flags.isSwiftAsync()) {
1847 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
1848 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF))
1849 X86FI->setHasSwiftAsyncContext(true);
1850 else {
1851 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
1852 int FI =
1853 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize), false);
1854 X86FI->setSwiftAsyncContextFrameIdx(FI);
1855 SDValue St = DAG.getStore(
1856 DAG.getEntryNode(), dl, InVals[I],
1857 DAG.getFrameIndex(FI, PtrSize == 8 ? MVT::i64 : MVT::i32),
1858 MachinePointerInfo::getFixedStack(MF, FI));
1859 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
1860 }
1861 }
1862
1863 // Swift calling convention does not require we copy the sret argument
1864 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
1865 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
1866 continue;
1867
1868 // All x86 ABIs require that for returning structs by value we copy the
1869 // sret argument into %rax/%eax (depending on ABI) for the return. Save
1870 // the argument into a virtual register so that we can access it from the
1871 // return points.
1872 if (Ins[I].Flags.isSRet()) {
1873 assert(!FuncInfo->getSRetReturnReg() &&
1874 "SRet return has already been set");
1875 MVT PtrTy = getPointerTy(DAG.getDataLayout());
1876 Register Reg =
1877 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
1878 FuncInfo->setSRetReturnReg(Reg);
1879 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
1880 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1881 break;
1882 }
1883 }
1884
1885 unsigned StackSize = CCInfo.getStackSize();
1886 // Align stack specially for tail calls.
1887 if (shouldGuaranteeTCO(CallConv,
1888 MF.getTarget().Options.GuaranteedTailCallOpt))
1889 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1890
1891 if (IsVarArg)
1892 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
1893 .lowerVarArgsParameters(Chain, StackSize);
1894
1895 // Some CCs need callee pop.
1896 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
1897 MF.getTarget().Options.GuaranteedTailCallOpt)) {
1898 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
1899 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
1900 // X86 interrupts must pop the error code (and the alignment padding) if
1901 // present.
1902 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
1903 } else {
1904 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
1905 // If this is an sret function, the return should pop the hidden pointer.
1906 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
1907 FuncInfo->setBytesToPopOnReturn(4);
1908 }
1909
1910 if (!Is64Bit) {
1911 // RegSaveFrameIndex is X86-64 only.
1912 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1913 }
1914
1915 FuncInfo->setArgumentStackSize(StackSize);
1916
1917 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
1918 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
1919 if (Personality == EHPersonality::CoreCLR) {
1920 assert(Is64Bit);
1921 // TODO: Add a mechanism to frame lowering that will allow us to indicate
1922 // that we'd prefer this slot be allocated towards the bottom of the frame
1923 // (i.e. near the stack pointer after allocating the frame). Every
1924 // funclet needs a copy of this slot in its (mostly empty) frame, and the
1925 // offset from the bottom of this and each funclet's frame must be the
1926 // same, so the size of funclets' (mostly empty) frames is dictated by
1927 // how far this slot is from the bottom (since they allocate just enough
1928 // space to accommodate holding this slot at the correct offset).
1929 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
1930 EHInfo->PSPSymFrameIdx = PSPSymFI;
1931 }
1932 }
1933
1934 if (shouldDisableArgRegFromCSR(CallConv) ||
1935 F.hasFnAttribute("no_caller_saved_registers")) {
1936 MachineRegisterInfo &MRI = MF.getRegInfo();
1937 for (std::pair<MCRegister, Register> Pair : MRI.liveins())
1938 MRI.disableCalleeSavedRegister(Pair.first);
1939 }
1940
1941 if (CallingConv::PreserveNone == CallConv)
1942 for (const ISD::InputArg &In : Ins) {
1943 if (In.Flags.isSwiftSelf() || In.Flags.isSwiftAsync() ||
1944 In.Flags.isSwiftError()) {
1945 errorUnsupported(DAG, dl,
1946 "Swift attributes can't be used with preserve_none");
1947 break;
1948 }
1949 }
1950
1951 return Chain;
1952 }
1953
LowerMemOpCallTo(SDValue Chain,SDValue StackPtr,SDValue Arg,const SDLoc & dl,SelectionDAG & DAG,const CCValAssign & VA,ISD::ArgFlagsTy Flags,bool isByVal) const1954 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
1955 SDValue Arg, const SDLoc &dl,
1956 SelectionDAG &DAG,
1957 const CCValAssign &VA,
1958 ISD::ArgFlagsTy Flags,
1959 bool isByVal) const {
1960 unsigned LocMemOffset = VA.getLocMemOffset();
1961 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1962 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1963 StackPtr, PtrOff);
1964 if (isByVal)
1965 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1966
1967 MaybeAlign Alignment;
1968 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
1969 Arg.getSimpleValueType() != MVT::f80)
1970 Alignment = MaybeAlign(4);
1971 return DAG.getStore(
1972 Chain, dl, Arg, PtrOff,
1973 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
1974 Alignment);
1975 }
1976
1977 /// Emit a load of return address if tail call
1978 /// optimization is performed and it is required.
EmitTailCallLoadRetAddr(SelectionDAG & DAG,SDValue & OutRetAddr,SDValue Chain,bool IsTailCall,bool Is64Bit,int FPDiff,const SDLoc & dl) const1979 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
1980 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
1981 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
1982 // Adjust the Return address stack slot.
1983 EVT VT = getPointerTy(DAG.getDataLayout());
1984 OutRetAddr = getReturnAddressFrameIndex(DAG);
1985
1986 // Load the "old" Return address.
1987 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
1988 return SDValue(OutRetAddr.getNode(), 1);
1989 }
1990
1991 /// Emit a store of the return address if tail call
1992 /// optimization is performed and it is required (FPDiff!=0).
EmitTailCallStoreRetAddr(SelectionDAG & DAG,MachineFunction & MF,SDValue Chain,SDValue RetAddrFrIdx,EVT PtrVT,unsigned SlotSize,int FPDiff,const SDLoc & dl)1993 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
1994 SDValue Chain, SDValue RetAddrFrIdx,
1995 EVT PtrVT, unsigned SlotSize,
1996 int FPDiff, const SDLoc &dl) {
1997 // Store the return address to the appropriate stack slot.
1998 if (!FPDiff) return Chain;
1999 // Calculate the new stack slot for the return address.
2000 int NewReturnAddrFI =
2001 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2002 false);
2003 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2004 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2005 MachinePointerInfo::getFixedStack(
2006 DAG.getMachineFunction(), NewReturnAddrFI));
2007 return Chain;
2008 }
2009
2010 /// Returns a vector_shuffle mask for an movs{s|d}, movd
2011 /// operation of specified width.
getMOVL(SelectionDAG & DAG,const SDLoc & dl,MVT VT,SDValue V1,SDValue V2) const2012 SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
2013 SDValue V1, SDValue V2) const {
2014 unsigned NumElems = VT.getVectorNumElements();
2015 SmallVector<int, 8> Mask;
2016 Mask.push_back(NumElems);
2017 for (unsigned i = 1; i != NumElems; ++i)
2018 Mask.push_back(i);
2019 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
2020 }
2021
2022 SDValue
LowerCall(TargetLowering::CallLoweringInfo & CLI,SmallVectorImpl<SDValue> & InVals) const2023 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2024 SmallVectorImpl<SDValue> &InVals) const {
2025 SelectionDAG &DAG = CLI.DAG;
2026 SDLoc &dl = CLI.DL;
2027 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2028 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2029 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2030 SDValue Chain = CLI.Chain;
2031 SDValue Callee = CLI.Callee;
2032 CallingConv::ID CallConv = CLI.CallConv;
2033 bool &isTailCall = CLI.IsTailCall;
2034 bool isVarArg = CLI.IsVarArg;
2035 const auto *CB = CLI.CB;
2036
2037 MachineFunction &MF = DAG.getMachineFunction();
2038 bool Is64Bit = Subtarget.is64Bit();
2039 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2040 bool IsSibcall = false;
2041 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
2042 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
2043 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
2044 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2045 bool HasNCSR = (CB && isa<CallInst>(CB) &&
2046 CB->hasFnAttr("no_caller_saved_registers"));
2047 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
2048 bool IsCFICall = IsIndirectCall && CLI.CFIType;
2049 const Module *M = MF.getFunction().getParent();
2050
2051 // If the indirect call target has the nocf_check attribute, the call needs
2052 // the NOTRACK prefix. For simplicity just disable tail calls as there are
2053 // so many variants.
2054 bool IsNoTrackIndirectCall = IsIndirectCall && CB->doesNoCfCheck() &&
2055 M->getModuleFlag("cf-protection-branch");
2056 if (IsNoTrackIndirectCall)
2057 isTailCall = false;
2058
2059 MachineFunction::CallSiteInfo CSInfo;
2060 if (CallConv == CallingConv::X86_INTR)
2061 report_fatal_error("X86 interrupts may not be called directly");
2062
2063 if (IsIndirectCall && !IsWin64 &&
2064 M->getModuleFlag("import-call-optimization"))
2065 errorUnsupported(DAG, dl,
2066 "Indirect calls must have a normal calling convention if "
2067 "Import Call Optimization is enabled");
2068
2069 // Analyze operands of the call, assigning locations to each operand.
2070 SmallVector<CCValAssign, 16> ArgLocs;
2071 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2072
2073 // Allocate shadow area for Win64.
2074 if (IsWin64)
2075 CCInfo.AllocateStack(32, Align(8));
2076
2077 CCInfo.AnalyzeArguments(Outs, CC_X86);
2078
2079 // In vectorcall calling convention a second pass is required for the HVA
2080 // types.
2081 if (CallingConv::X86_VectorCall == CallConv) {
2082 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
2083 }
2084
2085 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
2086 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
2087 // If we are using a GOT, disable tail calls to external symbols with
2088 // default visibility. Tail calling such a symbol requires using a GOT
2089 // relocation, which forces early binding of the symbol. This breaks code
2090 // that require lazy function symbol resolution. Using musttail or
2091 // GuaranteedTailCallOpt will override this.
2092 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2093 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
2094 G->getGlobal()->hasDefaultVisibility()))
2095 isTailCall = false;
2096 }
2097
2098 if (isTailCall && !IsMustTail) {
2099 // Check if it's really possible to do a tail call.
2100 isTailCall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs,
2101 IsCalleePopSRet);
2102
2103 // Sibcalls are automatically detected tailcalls which do not require
2104 // ABI changes.
2105 if (!IsGuaranteeTCO && isTailCall)
2106 IsSibcall = true;
2107
2108 if (isTailCall)
2109 ++NumTailCalls;
2110 }
2111
2112 if (IsMustTail && !isTailCall)
2113 report_fatal_error("failed to perform tail call elimination on a call "
2114 "site marked musttail");
2115
2116 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
2117 "Var args not supported with calling convention fastcc, ghc or hipe");
2118
2119 // Get a count of how many bytes are to be pushed on the stack.
2120 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
2121 if (IsSibcall)
2122 // This is a sibcall. The memory operands are available in caller's
2123 // own caller's stack.
2124 NumBytes = 0;
2125 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
2126 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2127
2128 int FPDiff = 0;
2129 if (isTailCall &&
2130 shouldGuaranteeTCO(CallConv,
2131 MF.getTarget().Options.GuaranteedTailCallOpt)) {
2132 // Lower arguments at fp - stackoffset + fpdiff.
2133 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2134
2135 FPDiff = NumBytesCallerPushed - NumBytes;
2136
2137 // Set the delta of movement of the returnaddr stackslot.
2138 // But only set if delta is greater than previous delta.
2139 if (FPDiff < X86Info->getTCReturnAddrDelta())
2140 X86Info->setTCReturnAddrDelta(FPDiff);
2141 }
2142
2143 unsigned NumBytesToPush = NumBytes;
2144 unsigned NumBytesToPop = NumBytes;
2145
2146 // If we have an inalloca argument, all stack space has already been allocated
2147 // for us and be right at the top of the stack. We don't support multiple
2148 // arguments passed in memory when using inalloca.
2149 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2150 NumBytesToPush = 0;
2151 if (!ArgLocs.back().isMemLoc())
2152 report_fatal_error("cannot use inalloca attribute on a register "
2153 "parameter");
2154 if (ArgLocs.back().getLocMemOffset() != 0)
2155 report_fatal_error("any parameter with the inalloca attribute must be "
2156 "the only memory argument");
2157 } else if (CLI.IsPreallocated) {
2158 assert(ArgLocs.back().isMemLoc() &&
2159 "cannot use preallocated attribute on a register "
2160 "parameter");
2161 SmallVector<size_t, 4> PreallocatedOffsets;
2162 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
2163 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
2164 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
2165 }
2166 }
2167 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
2168 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
2169 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
2170 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
2171 NumBytesToPush = 0;
2172 }
2173
2174 if (!IsSibcall && !IsMustTail)
2175 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
2176 NumBytes - NumBytesToPush, dl);
2177
2178 SDValue RetAddrFrIdx;
2179 // Load return address for tail calls.
2180 if (isTailCall && FPDiff)
2181 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2182 Is64Bit, FPDiff, dl);
2183
2184 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
2185 SmallVector<SDValue, 8> MemOpChains;
2186 SDValue StackPtr;
2187
2188 // The next loop assumes that the locations are in the same order of the
2189 // input arguments.
2190 assert(isSortedByValueNo(ArgLocs) &&
2191 "Argument Location list must be sorted before lowering");
2192
2193 // Walk the register/memloc assignments, inserting copies/loads. In the case
2194 // of tail call optimization arguments are handle later.
2195 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2196 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
2197 ++I, ++OutIndex) {
2198 assert(OutIndex < Outs.size() && "Invalid Out index");
2199 // Skip inalloca/preallocated arguments, they have already been written.
2200 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
2201 if (Flags.isInAlloca() || Flags.isPreallocated())
2202 continue;
2203
2204 CCValAssign &VA = ArgLocs[I];
2205 EVT RegVT = VA.getLocVT();
2206 SDValue Arg = OutVals[OutIndex];
2207 bool isByVal = Flags.isByVal();
2208
2209 // Promote the value if needed.
2210 switch (VA.getLocInfo()) {
2211 default: llvm_unreachable("Unknown loc info!");
2212 case CCValAssign::Full: break;
2213 case CCValAssign::SExt:
2214 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2215 break;
2216 case CCValAssign::ZExt:
2217 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2218 break;
2219 case CCValAssign::AExt:
2220 if (Arg.getValueType().isVector() &&
2221 Arg.getValueType().getVectorElementType() == MVT::i1)
2222 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
2223 else if (RegVT.is128BitVector()) {
2224 // Special case: passing MMX values in XMM registers.
2225 Arg = DAG.getBitcast(MVT::i64, Arg);
2226 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2227 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2228 } else
2229 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2230 break;
2231 case CCValAssign::BCvt:
2232 Arg = DAG.getBitcast(RegVT, Arg);
2233 break;
2234 case CCValAssign::Indirect: {
2235 if (isByVal) {
2236 // Memcpy the argument to a temporary stack slot to prevent
2237 // the caller from seeing any modifications the callee may make
2238 // as guaranteed by the `byval` attribute.
2239 int FrameIdx = MF.getFrameInfo().CreateStackObject(
2240 Flags.getByValSize(),
2241 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
2242 SDValue StackSlot =
2243 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
2244 Chain =
2245 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
2246 // From now on treat this as a regular pointer
2247 Arg = StackSlot;
2248 isByVal = false;
2249 } else {
2250 // Store the argument.
2251 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2252 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2253 Chain = DAG.getStore(
2254 Chain, dl, Arg, SpillSlot,
2255 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2256 Arg = SpillSlot;
2257 }
2258 break;
2259 }
2260 }
2261
2262 if (VA.needsCustom()) {
2263 assert(VA.getValVT() == MVT::v64i1 &&
2264 "Currently the only custom case is when we split v64i1 to 2 regs");
2265 // Split v64i1 value into two registers
2266 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
2267 } else if (VA.isRegLoc()) {
2268 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2269 const TargetOptions &Options = DAG.getTarget().Options;
2270 if (Options.EmitCallSiteInfo)
2271 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), I);
2272 if (isVarArg && IsWin64) {
2273 // Win64 ABI requires argument XMM reg to be copied to the corresponding
2274 // shadow reg if callee is a varargs function.
2275 Register ShadowReg;
2276 switch (VA.getLocReg()) {
2277 case X86::XMM0: ShadowReg = X86::RCX; break;
2278 case X86::XMM1: ShadowReg = X86::RDX; break;
2279 case X86::XMM2: ShadowReg = X86::R8; break;
2280 case X86::XMM3: ShadowReg = X86::R9; break;
2281 }
2282 if (ShadowReg)
2283 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2284 }
2285 } else if (!IsSibcall && (!isTailCall || isByVal)) {
2286 assert(VA.isMemLoc());
2287 if (!StackPtr.getNode())
2288 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2289 getPointerTy(DAG.getDataLayout()));
2290 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2291 dl, DAG, VA, Flags, isByVal));
2292 }
2293 }
2294
2295 if (!MemOpChains.empty())
2296 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2297
2298 if (Subtarget.isPICStyleGOT()) {
2299 // ELF / PIC requires GOT in the EBX register before function calls via PLT
2300 // GOT pointer (except regcall).
2301 if (!isTailCall) {
      // Indirect call with RegCall calling convention may use up all the
      // general registers, so it is not suitable to bind EBX register for
      // GOT address, just let register allocator handle it.
2305 if (CallConv != CallingConv::X86_RegCall)
2306 RegsToPass.push_back(std::make_pair(
2307 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2308 getPointerTy(DAG.getDataLayout()))));
2309 } else {
2310 // If we are tail calling and generating PIC/GOT style code load the
2311 // address of the callee into ECX. The value in ecx is used as target of
2312 // the tail jump. This is done to circumvent the ebx/callee-saved problem
2313 // for tail calls on PIC/GOT architectures. Normally we would just put the
2314 // address of GOT into ebx and then call target@PLT. But for tail calls
2315 // ebx would be restored (since ebx is callee saved) before jumping to the
2316 // target@PLT.
2317
2318 // Note: The actual moving to ECX is done further down.
2319 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2320 if (G && !G->getGlobal()->hasLocalLinkage() &&
2321 G->getGlobal()->hasDefaultVisibility())
2322 Callee = LowerGlobalAddress(Callee, DAG);
2323 else if (isa<ExternalSymbolSDNode>(Callee))
2324 Callee = LowerExternalSymbol(Callee, DAG);
2325 }
2326 }
2327
2328 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
2329 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
2330 // From AMD64 ABI document:
2331 // For calls that may call functions that use varargs or stdargs
2332 // (prototype-less calls or calls to functions containing ellipsis (...) in
2333 // the declaration) %al is used as hidden argument to specify the number
2334 // of SSE registers used. The contents of %al do not need to match exactly
2335 // the number of registers, but must be an ubound on the number of SSE
2336 // registers used and is in the range 0 - 8 inclusive.
2337
2338 // Count the number of XMM registers allocated.
2339 static const MCPhysReg XMMArgRegs[] = {
2340 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2341 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2342 };
2343 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
2344 assert((Subtarget.hasSSE1() || !NumXMMRegs)
2345 && "SSE registers cannot be used when SSE is disabled");
2346 RegsToPass.push_back(std::make_pair(Register(X86::AL),
2347 DAG.getConstant(NumXMMRegs, dl,
2348 MVT::i8)));
2349 }
2350
2351 if (isVarArg && IsMustTail) {
2352 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
2353 for (const auto &F : Forwards) {
2354 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2355 RegsToPass.push_back(std::make_pair(F.PReg, Val));
2356 }
2357 }
2358
2359 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
2360 // don't need this because the eligibility check rejects calls that require
2361 // shuffling arguments passed in memory.
2362 if (!IsSibcall && isTailCall) {
2363 // Force all the incoming stack arguments to be loaded from the stack
2364 // before any new outgoing arguments or the return address are stored to the
2365 // stack, because the outgoing stack slots may alias the incoming argument
2366 // stack slots, and the alias isn't otherwise explicit. This is slightly
2367 // more conservative than necessary, because it means that each store
2368 // effectively depends on every argument instead of just those arguments it
2369 // would clobber.
2370 Chain = DAG.getStackArgumentTokenFactor(Chain);
2371
2372 SmallVector<SDValue, 8> MemOpChains2;
2373 SDValue FIN;
2374 int FI = 0;
2375 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
2376 ++I, ++OutsIndex) {
2377 CCValAssign &VA = ArgLocs[I];
2378
2379 if (VA.isRegLoc()) {
2380 if (VA.needsCustom()) {
2381 assert((CallConv == CallingConv::X86_RegCall) &&
2382 "Expecting custom case only in regcall calling convention");
2383 // This means that we are in special case where one argument was
2384 // passed through two register locations - Skip the next location
2385 ++I;
2386 }
2387
2388 continue;
2389 }
2390
2391 assert(VA.isMemLoc());
2392 SDValue Arg = OutVals[OutsIndex];
2393 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
2394 // Skip inalloca/preallocated arguments. They don't require any work.
2395 if (Flags.isInAlloca() || Flags.isPreallocated())
2396 continue;
2397 // Create frame index.
2398 int32_t Offset = VA.getLocMemOffset()+FPDiff;
2399 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2400 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
2401 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2402
2403 if (Flags.isByVal()) {
2404 // Copy relative to framepointer.
2405 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
2406 if (!StackPtr.getNode())
2407 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2408 getPointerTy(DAG.getDataLayout()));
2409 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2410 StackPtr, Source);
2411
2412 MemOpChains2.push_back(
2413 CreateCopyOfByValArgument(Source, FIN, Chain, Flags, DAG, dl));
2414 } else {
2415 // Store relative to framepointer.
2416 MemOpChains2.push_back(DAG.getStore(
2417 Chain, dl, Arg, FIN,
2418 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
2419 }
2420 }
2421
2422 if (!MemOpChains2.empty())
2423 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
2424
2425 // Store the return address to the appropriate stack slot.
2426 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
2427 getPointerTy(DAG.getDataLayout()),
2428 RegInfo->getSlotSize(), FPDiff, dl);
2429 }
2430
2431 // Build a sequence of copy-to-reg nodes chained together with token chain
2432 // and glue operands which copy the outgoing args into registers.
2433 SDValue InGlue;
2434 for (const auto &[Reg, N] : RegsToPass) {
2435 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, InGlue);
2436 InGlue = Chain.getValue(1);
2437 }
2438
2439 bool IsImpCall = false;
2440 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
2441 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2442 // In the 64-bit large code model, we have to make all calls
2443 // through a register, since the call instruction's 32-bit
2444 // pc-relative offset may not be large enough to hold the whole
2445 // address.
2446 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
2447 Callee->getOpcode() == ISD::ExternalSymbol) {
2448 // Lower direct calls to global addresses and external symbols. Setting
2449 // ForCall to true here has the effect of removing WrapperRIP when possible
2450 // to allow direct calls to be selected without first materializing the
2451 // address into a register.
2452 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true, &IsImpCall);
2453 } else if (Subtarget.isTarget64BitILP32() &&
2454 Callee.getValueType() == MVT::i32) {
2455 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
2456 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
2457 }
2458
2459 SmallVector<SDValue, 8> Ops;
2460
2461 if (!IsSibcall && isTailCall && !IsMustTail) {
2462 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
2463 InGlue = Chain.getValue(1);
2464 }
2465
2466 Ops.push_back(Chain);
2467 Ops.push_back(Callee);
2468
2469 if (isTailCall)
2470 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, dl, MVT::i32));
2471
2472 // Add argument registers to the end of the list so that they are known live
2473 // into the call.
2474 for (const auto &[Reg, N] : RegsToPass)
2475 Ops.push_back(DAG.getRegister(Reg, N.getValueType()));
2476
2477 // Add a register mask operand representing the call-preserved registers.
2478 const uint32_t *Mask = [&]() {
2479 auto AdaptedCC = CallConv;
2480 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
2481 // use X86_INTR calling convention because it has the same CSR mask
2482 // (same preserved registers).
2483 if (HasNCSR)
2484 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
    // If NoCalleeSavedRegisters is requested, then use GHC since it happens
    // to use the CSR_NoRegs_RegMask.
2487 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
2488 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
2489 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
2490 }();
2491 assert(Mask && "Missing call preserved mask for calling convention");
2492
2493 if (MachineOperand::clobbersPhysReg(Mask, RegInfo->getFramePtr())) {
2494 X86Info->setFPClobberedByCall(true);
2495 if (CLI.CB && isa<InvokeInst>(CLI.CB))
2496 X86Info->setFPClobberedByInvoke(true);
2497 }
2498 if (MachineOperand::clobbersPhysReg(Mask, RegInfo->getBaseRegister())) {
2499 X86Info->setBPClobberedByCall(true);
2500 if (CLI.CB && isa<InvokeInst>(CLI.CB))
2501 X86Info->setBPClobberedByInvoke(true);
2502 }
2503
2504 // If this is an invoke in a 32-bit function using a funclet-based
2505 // personality, assume the function clobbers all registers. If an exception
2506 // is thrown, the runtime will not restore CSRs.
2507 // FIXME: Model this more precisely so that we can register allocate across
2508 // the normal edge and spill and fill across the exceptional edge.
2509 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
2510 const Function &CallerFn = MF.getFunction();
2511 EHPersonality Pers =
2512 CallerFn.hasPersonalityFn()
2513 ? classifyEHPersonality(CallerFn.getPersonalityFn())
2514 : EHPersonality::Unknown;
2515 if (isFuncletEHPersonality(Pers))
2516 Mask = RegInfo->getNoPreservedMask();
2517 }
2518
2519 // Define a new register mask from the existing mask.
2520 uint32_t *RegMask = nullptr;
2521
2522 // In some calling conventions we need to remove the used physical registers
2523 // from the reg mask. Create a new RegMask for such calling conventions.
2524 // RegMask for calling conventions that disable only return registers (e.g.
2525 // preserve_most) will be modified later in LowerCallResult.
2526 bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
2527 if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
2528 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2529
2530 // Allocate a new Reg Mask and copy Mask.
2531 RegMask = MF.allocateRegMask();
2532 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
2533 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
2534
2535 // Make sure all sub registers of the argument registers are reset
2536 // in the RegMask.
2537 if (ShouldDisableArgRegs) {
2538 for (auto const &RegPair : RegsToPass)
2539 for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first))
2540 RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
2541 }
2542
2543 // Create the RegMask Operand according to our updated mask.
2544 Ops.push_back(DAG.getRegisterMask(RegMask));
2545 } else {
2546 // Create the RegMask Operand according to the static mask.
2547 Ops.push_back(DAG.getRegisterMask(Mask));
2548 }
2549
2550 if (InGlue.getNode())
2551 Ops.push_back(InGlue);
2552
2553 if (isTailCall) {
2554 // We used to do:
2555 //// If this is the first return lowered for this function, add the regs
2556 //// to the liveout set for the function.
2557 // This isn't right, although it's probably harmless on x86; liveouts
2558 // should be computed from returns not tail calls. Consider a void
2559 // function making a tail call to a function returning int.
2560 MF.getFrameInfo().setHasTailCall();
2561 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, MVT::Other, Ops);
2562
2563 if (IsCFICall)
2564 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2565
2566 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2567 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2568 return Ret;
2569 }
2570
2571 // Returns a chain & a glue for retval copy to use.
2572 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2573 if (IsImpCall) {
2574 Chain = DAG.getNode(X86ISD::IMP_CALL, dl, NodeTys, Ops);
2575 } else if (IsNoTrackIndirectCall) {
2576 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
2577 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
2578 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
2579 // expanded to the call, directly followed by a special marker sequence and
2580 // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
2581 assert(!isTailCall &&
2582 "tail calls cannot be marked with clang.arc.attachedcall");
2583 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
2584
2585 // Add a target global address for the retainRV/claimRV runtime function
2586 // just before the call target.
2587 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
2588 auto PtrVT = getPointerTy(DAG.getDataLayout());
2589 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
2590 Ops.insert(Ops.begin() + 1, GA);
2591 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
2592 } else {
2593 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
2594 }
2595
2596 if (IsCFICall)
2597 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2598
2599 InGlue = Chain.getValue(1);
2600 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2601 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2602
2603 // Save heapallocsite metadata.
2604 if (CLI.CB)
2605 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
2606 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
2607
2608 // Create the CALLSEQ_END node.
2609 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
2610 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2611 DAG.getTarget().Options.GuaranteedTailCallOpt))
2612 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
2613 else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
2614 // If this call passes a struct-return pointer, the callee
2615 // pops that struct pointer.
2616 NumBytesForCalleeToPop = 4;
2617
2618 // Returns a glue for retval copy to use.
2619 if (!IsSibcall) {
2620 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
2621 InGlue, dl);
2622 InGlue = Chain.getValue(1);
2623 }
2624
2625 if (CallingConv::PreserveNone == CallConv)
2626 for (const ISD::OutputArg &Out : Outs) {
2627 if (Out.Flags.isSwiftSelf() || Out.Flags.isSwiftAsync() ||
2628 Out.Flags.isSwiftError()) {
2629 errorUnsupported(DAG, dl,
2630 "Swift attributes can't be used with preserve_none");
2631 break;
2632 }
2633 }
2634
2635 // Handle result values, copying them out of physregs into vregs that we
2636 // return.
2637 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2638 InVals, RegMask);
2639 }
2640
2641 //===----------------------------------------------------------------------===//
2642 // Fast Calling Convention (tail call) implementation
2643 //===----------------------------------------------------------------------===//
2644
// Like the stdcall convention, the callee cleans up the arguments, except that
// ECX is reserved for storing the address of the tail-called function. Only 2
// registers are free for argument passing (inreg). Tail call optimization is
// performed provided:
2649 // * tailcallopt is enabled
2650 // * caller/callee are fastcc
2651 // On X86_64 architecture with GOT-style position independent code only local
2652 // (within module) calls are supported at the moment.
2653 // To keep the stack aligned according to platform abi the function
2654 // GetAlignedArgumentStackSize ensures that argument delta is always multiples
2655 // of stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
2656 // If a tail called function callee has more arguments than the caller the
2657 // caller needs to make sure that there is room to move the RETADDR to. This is
2658 // achieved by reserving an area the size of the argument delta right after the
2659 // original RETADDR, but before the saved framepointer or the spilled registers
2660 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
2661 // stack layout:
2662 // arg1
2663 // arg2
2664 // RETADDR
2665 // [ new RETADDR
2666 // move area ]
2667 // (possible EBP)
2668 // ESI
2669 // EDI
2670 // local1 ..
2671
2672 /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
2673 /// requirement.
2674 unsigned
GetAlignedArgumentStackSize(const unsigned StackSize,SelectionDAG & DAG) const2675 X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
2676 SelectionDAG &DAG) const {
2677 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
2678 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
2679 assert(StackSize % SlotSize == 0 &&
2680 "StackSize must be a multiple of SlotSize");
2681 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
2682 }
2683
2684 /// Return true if the given stack call argument is already available in the
2685 /// same position (relatively) of the caller's incoming argument stack.
2686 static
MatchingStackOffset(SDValue Arg,unsigned Offset,ISD::ArgFlagsTy Flags,MachineFrameInfo & MFI,const MachineRegisterInfo * MRI,const X86InstrInfo * TII,const CCValAssign & VA)2687 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2688 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2689 const X86InstrInfo *TII, const CCValAssign &VA) {
2690 unsigned Bytes = Arg.getValueSizeInBits() / 8;
2691
2692 for (;;) {
2693 // Look through nodes that don't alter the bits of the incoming value.
2694 unsigned Op = Arg.getOpcode();
2695 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
2696 Op == ISD::AssertZext) {
2697 Arg = Arg.getOperand(0);
2698 continue;
2699 }
2700 if (Op == ISD::TRUNCATE) {
2701 const SDValue &TruncInput = Arg.getOperand(0);
2702 if (TruncInput.getOpcode() == ISD::AssertZext &&
2703 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
2704 Arg.getValueType()) {
2705 Arg = TruncInput.getOperand(0);
2706 continue;
2707 }
2708 }
2709 break;
2710 }
2711
2712 int FI = INT_MAX;
2713 if (Arg.getOpcode() == ISD::CopyFromReg) {
2714 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2715 if (!VR.isVirtual())
2716 return false;
2717 MachineInstr *Def = MRI->getVRegDef(VR);
2718 if (!Def)
2719 return false;
2720 if (!Flags.isByVal()) {
2721 if (!TII->isLoadFromStackSlot(*Def, FI))
2722 return false;
2723 } else {
2724 unsigned Opcode = Def->getOpcode();
2725 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
2726 Opcode == X86::LEA64_32r) &&
2727 Def->getOperand(1).isFI()) {
2728 FI = Def->getOperand(1).getIndex();
2729 Bytes = Flags.getByValSize();
2730 } else
2731 return false;
2732 }
2733 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2734 if (Flags.isByVal())
2735 // ByVal argument is passed in as a pointer but it's now being
2736 // dereferenced. e.g.
2737 // define @foo(%struct.X* %A) {
2738 // tail call @bar(%struct.X* byval %A)
2739 // }
2740 return false;
2741 SDValue Ptr = Ld->getBasePtr();
2742 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2743 if (!FINode)
2744 return false;
2745 FI = FINode->getIndex();
2746 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
2747 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
2748 FI = FINode->getIndex();
2749 Bytes = Flags.getByValSize();
2750 } else
2751 return false;
2752
2753 assert(FI != INT_MAX);
2754 if (!MFI.isFixedObjectIndex(FI))
2755 return false;
2756
2757 if (Offset != MFI.getObjectOffset(FI))
2758 return false;
2759
2760 // If this is not byval, check that the argument stack object is immutable.
2761 // inalloca and argument copy elision can create mutable argument stack
2762 // objects. Byval objects can be mutated, but a byval call intends to pass the
2763 // mutated memory.
2764 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
2765 return false;
2766
2767 if (VA.getLocVT().getFixedSizeInBits() >
2768 Arg.getValueSizeInBits().getFixedValue()) {
2769 // If the argument location is wider than the argument type, check that any
2770 // extension flags match.
2771 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
2772 Flags.isSExt() != MFI.isObjectSExt(FI)) {
2773 return false;
2774 }
2775 }
2776
2777 return Bytes == MFI.getObjectSize(FI);
2778 }
2779
2780 /// Check whether the call is eligible for tail call optimization. Targets
2781 /// that want to do tail call optimization should implement this function.
2782 /// Note that the x86 backend does not check musttail calls for eligibility! The
2783 /// rest of x86 tail call lowering must be prepared to forward arguments of any
2784 /// type.
IsEligibleForTailCallOptimization(TargetLowering::CallLoweringInfo & CLI,CCState & CCInfo,SmallVectorImpl<CCValAssign> & ArgLocs,bool IsCalleePopSRet) const2785 bool X86TargetLowering::IsEligibleForTailCallOptimization(
2786 TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
2787 SmallVectorImpl<CCValAssign> &ArgLocs, bool IsCalleePopSRet) const {
2788 SelectionDAG &DAG = CLI.DAG;
2789 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2790 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2791 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2792 SDValue Callee = CLI.Callee;
2793 CallingConv::ID CalleeCC = CLI.CallConv;
2794 bool isVarArg = CLI.IsVarArg;
2795
2796 if (!mayTailCallThisCC(CalleeCC))
2797 return false;
2798
2799 // If -tailcallopt is specified, make fastcc functions tail-callable.
2800 MachineFunction &MF = DAG.getMachineFunction();
2801 const Function &CallerF = MF.getFunction();
2802
2803 // If the function return type is x86_fp80 and the callee return type is not,
2804 // then the FP_EXTEND of the call result is not a nop. It's not safe to
2805 // perform a tailcall optimization here.
2806 if (CallerF.getReturnType()->isX86_FP80Ty() && !CLI.RetTy->isX86_FP80Ty())
2807 return false;
2808
2809 CallingConv::ID CallerCC = CallerF.getCallingConv();
2810 bool CCMatch = CallerCC == CalleeCC;
2811 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
2812 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
2813 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
2814 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
2815
2816 // Win64 functions have extra shadow space for argument homing. Don't do the
2817 // sibcall if the caller and callee have mismatched expectations for this
2818 // space.
2819 if (IsCalleeWin64 != IsCallerWin64)
2820 return false;
2821
2822 if (IsGuaranteeTCO) {
2823 if (canGuaranteeTCO(CalleeCC) && CCMatch)
2824 return true;
2825 return false;
2826 }
2827
2828 // Look for obvious safe cases to perform tail call optimization that do not
2829 // require ABI changes. This is what gcc calls sibcall.
2830
2831 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
2832 // emit a special epilogue.
2833 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2834 if (RegInfo->hasStackRealignment(MF))
2835 return false;
2836
2837 // Also avoid sibcall optimization if we're an sret return fn and the callee
2838 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
2839 // insufficient.
2840 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
2841 // For a compatible tail call the callee must return our sret pointer. So it
2842 // needs to be (a) an sret function itself and (b) we pass our sret as its
2843 // sret. Condition #b is harder to determine.
2844 return false;
2845 } else if (IsCalleePopSRet)
2846 // The callee pops an sret, so we cannot tail-call, as our caller doesn't
2847 // expect that.
2848 return false;
2849
2850 // Do not sibcall optimize vararg calls unless all arguments are passed via
2851 // registers.
2852 LLVMContext &C = *DAG.getContext();
2853 if (isVarArg && !Outs.empty()) {
2854 // Optimizing for varargs on Win64 is unlikely to be safe without
2855 // additional testing.
2856 if (IsCalleeWin64 || IsCallerWin64)
2857 return false;
2858
2859 for (const auto &VA : ArgLocs)
2860 if (!VA.isRegLoc())
2861 return false;
2862 }
2863
2864 // If the call result is in ST0 / ST1, it needs to be popped off the x87
2865 // stack. Therefore, if it's not used by the call it is not safe to optimize
2866 // this into a sibcall.
2867 bool Unused = false;
2868 for (const auto &In : Ins) {
2869 if (!In.Used) {
2870 Unused = true;
2871 break;
2872 }
2873 }
2874 if (Unused) {
2875 SmallVector<CCValAssign, 16> RVLocs;
2876 CCState RVCCInfo(CalleeCC, false, MF, RVLocs, C);
2877 RVCCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2878 for (const auto &VA : RVLocs) {
2879 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
2880 return false;
2881 }
2882 }
2883
2884 // Check that the call results are passed in the same way.
2885 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
2886 RetCC_X86, RetCC_X86))
2887 return false;
2888 // The callee has to preserve all registers the caller needs to preserve.
2889 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2890 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2891 if (!CCMatch) {
2892 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2893 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2894 return false;
2895 }
2896
2897 // The stack frame of the caller cannot be replaced by the tail-callee one's
2898 // if the function is required to preserve all the registers. Conservatively
2899 // prevent tail optimization even if hypothetically all the registers are used
2900 // for passing formal parameters or returning values.
2901 if (CallerF.hasFnAttribute("no_caller_saved_registers"))
2902 return false;
2903
2904 unsigned StackArgsSize = CCInfo.getStackSize();
2905
2906 // If the callee takes no arguments then go on to check the results of the
2907 // call.
2908 if (!Outs.empty()) {
2909 if (StackArgsSize > 0) {
2910 // Check if the arguments are already laid out in the right way as
2911 // the caller's fixed stack objects.
2912 MachineFrameInfo &MFI = MF.getFrameInfo();
2913 const MachineRegisterInfo *MRI = &MF.getRegInfo();
2914 const X86InstrInfo *TII = Subtarget.getInstrInfo();
2915 for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
2916 const CCValAssign &VA = ArgLocs[I];
2917 SDValue Arg = OutVals[I];
2918 ISD::ArgFlagsTy Flags = Outs[I].Flags;
2919 if (VA.getLocInfo() == CCValAssign::Indirect)
2920 return false;
2921 if (!VA.isRegLoc()) {
2922 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI,
2923 TII, VA))
2924 return false;
2925 }
2926 }
2927 }
2928
2929 bool PositionIndependent = isPositionIndependent();
2930 // If the tailcall address may be in a register, then make sure it's
2931 // possible to register allocate for it. In 32-bit, the call address can
2932 // only target EAX, EDX, or ECX since the tail call must be scheduled after
2933 // callee-saved registers are restored. These happen to be the same
2934 // registers used to pass 'inreg' arguments so watch out for those.
2935 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
2936 !isa<ExternalSymbolSDNode>(Callee)) ||
2937 PositionIndependent)) {
2938 unsigned NumInRegs = 0;
2939 // In PIC we need an extra register to formulate the address computation
2940 // for the callee.
2941 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
2942
2943 for (const auto &VA : ArgLocs) {
2944 if (!VA.isRegLoc())
2945 continue;
2946 Register Reg = VA.getLocReg();
2947 switch (Reg) {
2948 default: break;
2949 case X86::EAX: case X86::EDX: case X86::ECX:
2950 if (++NumInRegs == MaxInRegs)
2951 return false;
2952 break;
2953 }
2954 }
2955 }
2956
2957 const MachineRegisterInfo &MRI = MF.getRegInfo();
2958 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
2959 return false;
2960 }
2961
2962 bool CalleeWillPop =
2963 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
2964 MF.getTarget().Options.GuaranteedTailCallOpt);
2965
2966 if (unsigned BytesToPop =
2967 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
2968 // If we have bytes to pop, the callee must pop them.
2969 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
2970 if (!CalleePopMatches)
2971 return false;
2972 } else if (CalleeWillPop && StackArgsSize > 0) {
2973 // If we don't have bytes to pop, make sure the callee doesn't pop any.
2974 return false;
2975 }
2976
2977 return true;
2978 }
2979
2980 /// Determines whether the callee is required to pop its own arguments.
2981 /// Callee pop is necessary to support tail calls.
isCalleePop(CallingConv::ID CallingConv,bool is64Bit,bool IsVarArg,bool GuaranteeTCO)2982 bool X86::isCalleePop(CallingConv::ID CallingConv,
2983 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
2984 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
2985 // can guarantee TCO.
2986 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
2987 return true;
2988
2989 switch (CallingConv) {
2990 default:
2991 return false;
2992 case CallingConv::X86_StdCall:
2993 case CallingConv::X86_FastCall:
2994 case CallingConv::X86_ThisCall:
2995 case CallingConv::X86_VectorCall:
2996 return !is64Bit;
2997 }
2998 }
2999