1 //===- llvm/lib/Target/X86/X86ISelCallLowering.cpp - Call lowering --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file implements the lowering of LLVM calls to DAG nodes.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "X86.h"
15 #include "X86CallingConv.h"
16 #include "X86FrameLowering.h"
17 #include "X86ISelLowering.h"
18 #include "X86InstrBuilder.h"
19 #include "X86MachineFunctionInfo.h"
20 #include "X86TargetMachine.h"
21 #include "X86TargetObjectFile.h"
22 #include "llvm/ADT/Statistic.h"
23 #include "llvm/Analysis/ObjCARCUtil.h"
24 #include "llvm/CodeGen/MachineJumpTableInfo.h"
25 #include "llvm/CodeGen/MachineModuleInfo.h"
26 #include "llvm/CodeGen/WinEHFuncInfo.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IRBuilder.h"
29
30 #define DEBUG_TYPE "x86-isel"
31
32 using namespace llvm;
33
34 STATISTIC(NumTailCalls, "Number of tail calls");
35
36 /// Call this when the user attempts to do something unsupported, like
37 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
38 /// report_fatal_error, so calling code should attempt to recover without
39 /// crashing.
40 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
41 const char *Msg) {
42 MachineFunction &MF = DAG.getMachineFunction();
43 DAG.getContext()->diagnose(
44 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
45 }
46
47 /// Returns true if a CC can dynamically exclude a register from the list of
48 /// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
49 /// the return registers.
50 static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
51 switch (CC) {
52 default:
53 return false;
54 case CallingConv::X86_RegCall:
55 case CallingConv::PreserveMost:
56 case CallingConv::PreserveAll:
57 return true;
58 }
59 }
60
61 /// Returns true if a CC can dynamically exclude a register from the list of
62 /// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
63 /// the parameters.
64 static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
65 return CC == CallingConv::X86_RegCall;
66 }
67
68 static std::pair<MVT, unsigned>
69 handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
70 const X86Subtarget &Subtarget) {
71 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
72 // convention is one that uses k registers.
73 if (NumElts == 2)
74 return {MVT::v2i64, 1};
75 if (NumElts == 4)
76 return {MVT::v4i32, 1};
77 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
78 CC != CallingConv::Intel_OCL_BI)
79 return {MVT::v8i16, 1};
80 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
81 CC != CallingConv::Intel_OCL_BI)
82 return {MVT::v16i8, 1};
83 // v32i1 passes in ymm unless we have BWI and the calling convention is
84 // regcall.
85 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
86 return {MVT::v32i8, 1};
87 // Split v64i1 vectors if we don't have v64i8 available.
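// For example, with AVX512BW and 512-bit registers in use a v64i1 mask fits in
// a single v64i8, while with BWI but only 256-bit registers preferred it is
// split into two v32i8 halves; without BWI the wide-mask cases fall through to
// the scalar i8-per-element breakdown below.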
88 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) { 89 if (Subtarget.useAVX512Regs()) 90 return {MVT::v64i8, 1}; 91 return {MVT::v32i8, 2}; 92 } 93 94 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. 95 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) || 96 NumElts > 64) 97 return {MVT::i8, NumElts}; 98 99 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0}; 100 } 101 102 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, 103 CallingConv::ID CC, 104 EVT VT) const { 105 if (VT.isVector()) { 106 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) { 107 unsigned NumElts = VT.getVectorNumElements(); 108 109 MVT RegisterVT; 110 unsigned NumRegisters; 111 std::tie(RegisterVT, NumRegisters) = 112 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); 113 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) 114 return RegisterVT; 115 } 116 117 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8) 118 return MVT::v8f16; 119 } 120 121 // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled. 122 if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() && 123 !Subtarget.hasX87()) 124 return MVT::i32; 125 126 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16) 127 return getRegisterTypeForCallingConv(Context, CC, 128 VT.changeVectorElementType(MVT::f16)); 129 130 if (VT == MVT::bf16) 131 return MVT::f16; 132 133 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); 134 } 135 136 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, 137 CallingConv::ID CC, 138 EVT VT) const { 139 if (VT.isVector()) { 140 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) { 141 unsigned NumElts = VT.getVectorNumElements(); 142 143 MVT RegisterVT; 144 unsigned NumRegisters; 145 std::tie(RegisterVT, NumRegisters) = 146 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); 147 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) 148 return NumRegisters; 149 } 150 151 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8) 152 return 1; 153 } 154 155 // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if 156 // x87 is disabled. 157 if (!Subtarget.is64Bit() && !Subtarget.hasX87()) { 158 if (VT == MVT::f64) 159 return 2; 160 if (VT == MVT::f80) 161 return 3; 162 } 163 164 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16) 165 return getNumRegistersForCallingConv(Context, CC, 166 VT.changeVectorElementType(MVT::f16)); 167 168 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); 169 } 170 171 unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( 172 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, 173 unsigned &NumIntermediates, MVT &RegisterVT) const { 174 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. 175 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && 176 Subtarget.hasAVX512() && 177 (!isPowerOf2_32(VT.getVectorNumElements()) || 178 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) || 179 VT.getVectorNumElements() > 64)) { 180 RegisterVT = MVT::i8; 181 IntermediateVT = MVT::i1; 182 NumIntermediates = VT.getVectorNumElements(); 183 return NumIntermediates; 184 } 185 186 // Split v64i1 vectors if we don't have v64i8 available. 
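// This mirrors the {MVT::v32i8, 2} case in handleMaskRegisterForCallingConv():
// each v32i1 half is passed in its own 256-bit register.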
187 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && 188 CC != CallingConv::X86_RegCall) { 189 RegisterVT = MVT::v32i8; 190 IntermediateVT = MVT::v32i1; 191 NumIntermediates = 2; 192 return 2; 193 } 194 195 // Split vNbf16 vectors according to vNf16. 196 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16) 197 VT = VT.changeVectorElementType(MVT::f16); 198 199 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT, 200 NumIntermediates, RegisterVT); 201 } 202 203 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, 204 LLVMContext& Context, 205 EVT VT) const { 206 if (!VT.isVector()) 207 return MVT::i8; 208 209 if (Subtarget.hasAVX512()) { 210 // Figure out what this type will be legalized to. 211 EVT LegalVT = VT; 212 while (getTypeAction(Context, LegalVT) != TypeLegal) 213 LegalVT = getTypeToTransformTo(Context, LegalVT); 214 215 // If we got a 512-bit vector then we'll definitely have a vXi1 compare. 216 if (LegalVT.getSimpleVT().is512BitVector()) 217 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount()); 218 219 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) { 220 // If we legalized to less than a 512-bit vector, then we will use a vXi1 221 // compare for vXi32/vXi64 for sure. If we have BWI we will also support 222 // vXi16/vXi8. 223 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType(); 224 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32) 225 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount()); 226 } 227 } 228 229 return VT.changeVectorElementTypeToInteger(); 230 } 231 232 /// Helper for getByValTypeAlignment to determine 233 /// the desired ByVal argument alignment. 234 static void getMaxByValAlign(Type *Ty, Align &MaxAlign) { 235 if (MaxAlign == 16) 236 return; 237 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { 238 if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128) 239 MaxAlign = Align(16); 240 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 241 Align EltAlign; 242 getMaxByValAlign(ATy->getElementType(), EltAlign); 243 if (EltAlign > MaxAlign) 244 MaxAlign = EltAlign; 245 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 246 for (auto *EltTy : STy->elements()) { 247 Align EltAlign; 248 getMaxByValAlign(EltTy, EltAlign); 249 if (EltAlign > MaxAlign) 250 MaxAlign = EltAlign; 251 if (MaxAlign == 16) 252 break; 253 } 254 } 255 } 256 257 /// Return the desired alignment for ByVal aggregate 258 /// function arguments in the caller parameter area. For X86, aggregates 259 /// that contain SSE vectors are placed at 16-byte boundaries while the rest 260 /// are at 4-byte boundaries. 261 uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty, 262 const DataLayout &DL) const { 263 if (Subtarget.is64Bit()) { 264 // Max of 8 and alignment of type. 265 Align TyAlign = DL.getABITypeAlign(Ty); 266 if (TyAlign > 8) 267 return TyAlign.value(); 268 return 8; 269 } 270 271 Align Alignment(4); 272 if (Subtarget.hasSSE1()) 273 getMaxByValAlign(Ty, Alignment); 274 return Alignment.value(); 275 } 276 277 /// It returns EVT::Other if the type should be determined using generic 278 /// target-independent logic. 279 /// For vector ops we check that the overall size isn't larger than our 280 /// preferred vector width. 
281 EVT X86TargetLowering::getOptimalMemOpType( 282 const MemOp &Op, const AttributeList &FuncAttributes) const { 283 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) { 284 if (Op.size() >= 16 && 285 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) { 286 // FIXME: Check if unaligned 64-byte accesses are slow. 287 if (Op.size() >= 64 && Subtarget.hasAVX512() && Subtarget.hasEVEX512() && 288 (Subtarget.getPreferVectorWidth() >= 512)) { 289 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32; 290 } 291 // FIXME: Check if unaligned 32-byte accesses are slow. 292 if (Op.size() >= 32 && Subtarget.hasAVX() && 293 Subtarget.useLight256BitInstructions()) { 294 // Although this isn't a well-supported type for AVX1, we'll let 295 // legalization and shuffle lowering produce the optimal codegen. If we 296 // choose an optimal type with a vector element larger than a byte, 297 // getMemsetStores() may create an intermediate splat (using an integer 298 // multiply) before we splat as a vector. 299 return MVT::v32i8; 300 } 301 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128)) 302 return MVT::v16i8; 303 // TODO: Can SSE1 handle a byte vector? 304 // If we have SSE1 registers we should be able to use them. 305 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) && 306 (Subtarget.getPreferVectorWidth() >= 128)) 307 return MVT::v4f32; 308 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) && 309 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) { 310 // Do not use f64 to lower memcpy if source is string constant. It's 311 // better to use i32 to avoid the loads. 312 // Also, do not use f64 to lower memset unless this is a memset of zeros. 313 // The gymnastics of splatting a byte value into an XMM register and then 314 // only using 8-byte stores (because this is a CPU with slow unaligned 315 // 16-byte accesses) makes that a loser. 316 return MVT::f64; 317 } 318 } 319 // This is a compromise. If we reach here, unaligned accesses may be slow on 320 // this target. However, creating smaller, aligned accesses could be even 321 // slower and would certainly be a lot more code. 322 if (Subtarget.is64Bit() && Op.size() >= 8) 323 return MVT::i64; 324 return MVT::i32; 325 } 326 327 bool X86TargetLowering::isSafeMemOpType(MVT VT) const { 328 if (VT == MVT::f32) 329 return Subtarget.hasSSE1(); 330 if (VT == MVT::f64) 331 return Subtarget.hasSSE2(); 332 return true; 333 } 334 335 static bool isBitAligned(Align Alignment, uint64_t SizeInBits) { 336 return (8 * Alignment.value()) % SizeInBits == 0; 337 } 338 339 bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const { 340 if (isBitAligned(Alignment, VT.getSizeInBits())) 341 return true; 342 switch (VT.getSizeInBits()) { 343 default: 344 // 8-byte and under are always assumed to be fast. 345 return true; 346 case 128: 347 return !Subtarget.isUnalignedMem16Slow(); 348 case 256: 349 return !Subtarget.isUnalignedMem32Slow(); 350 // TODO: What about AVX-512 (512-bit) accesses? 351 } 352 } 353 354 bool X86TargetLowering::allowsMisalignedMemoryAccesses( 355 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags, 356 unsigned *Fast) const { 357 if (Fast) 358 *Fast = isMemoryAccessFast(VT, Alignment); 359 // NonTemporal vector memory ops must be aligned. 
360 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) { 361 // NT loads can only be vector aligned, so if its less aligned than the 362 // minimum vector size (which we can split the vector down to), we might as 363 // well use a regular unaligned vector load. 364 // We don't have any NT loads pre-SSE41. 365 if (!!(Flags & MachineMemOperand::MOLoad)) 366 return (Alignment < 16 || !Subtarget.hasSSE41()); 367 return false; 368 } 369 // Misaligned accesses of any size are always allowed. 370 return true; 371 } 372 373 bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context, 374 const DataLayout &DL, EVT VT, 375 unsigned AddrSpace, Align Alignment, 376 MachineMemOperand::Flags Flags, 377 unsigned *Fast) const { 378 if (Fast) 379 *Fast = isMemoryAccessFast(VT, Alignment); 380 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) { 381 if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags, 382 /*Fast=*/nullptr)) 383 return true; 384 // NonTemporal vector memory ops are special, and must be aligned. 385 if (!isBitAligned(Alignment, VT.getSizeInBits())) 386 return false; 387 switch (VT.getSizeInBits()) { 388 case 128: 389 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41()) 390 return true; 391 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2()) 392 return true; 393 return false; 394 case 256: 395 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2()) 396 return true; 397 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX()) 398 return true; 399 return false; 400 case 512: 401 if (Subtarget.hasAVX512() && Subtarget.hasEVEX512()) 402 return true; 403 return false; 404 default: 405 return false; // Don't have NonTemporal vector memory ops of this size. 406 } 407 } 408 return true; 409 } 410 411 /// Return the entry encoding for a jump table in the 412 /// current function. The returned value is a member of the 413 /// MachineJumpTableInfo::JTEntryKind enum. 414 unsigned X86TargetLowering::getJumpTableEncoding() const { 415 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF 416 // symbol. 417 if (isPositionIndependent() && Subtarget.isPICStyleGOT()) 418 return MachineJumpTableInfo::EK_Custom32; 419 if (isPositionIndependent() && 420 getTargetMachine().getCodeModel() == CodeModel::Large) 421 return MachineJumpTableInfo::EK_LabelDifference64; 422 423 // Otherwise, use the normal jump table encoding heuristics. 424 return TargetLowering::getJumpTableEncoding(); 425 } 426 427 bool X86TargetLowering::useSoftFloat() const { 428 return Subtarget.useSoftFloat(); 429 } 430 431 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC, 432 ArgListTy &Args) const { 433 434 // Only relabel X86-32 for C / Stdcall CCs. 
435 if (Subtarget.is64Bit()) 436 return; 437 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall) 438 return; 439 unsigned ParamRegs = 0; 440 if (auto *M = MF->getFunction().getParent()) 441 ParamRegs = M->getNumberRegisterParameters(); 442 443 // Mark the first N int arguments as having reg 444 for (auto &Arg : Args) { 445 Type *T = Arg.Ty; 446 if (T->isIntOrPtrTy()) 447 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) { 448 unsigned numRegs = 1; 449 if (MF->getDataLayout().getTypeAllocSize(T) > 4) 450 numRegs = 2; 451 if (ParamRegs < numRegs) 452 return; 453 ParamRegs -= numRegs; 454 Arg.IsInReg = true; 455 } 456 } 457 } 458 459 const MCExpr * 460 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 461 const MachineBasicBlock *MBB, 462 unsigned uid,MCContext &Ctx) const{ 463 assert(isPositionIndependent() && Subtarget.isPICStyleGOT()); 464 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF 465 // entries. 466 return MCSymbolRefExpr::create(MBB->getSymbol(), 467 MCSymbolRefExpr::VK_GOTOFF, Ctx); 468 } 469 470 /// Returns relocation base for the given PIC jumptable. 471 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 472 SelectionDAG &DAG) const { 473 if (!Subtarget.is64Bit()) 474 // This doesn't have SDLoc associated with it, but is not really the 475 // same as a Register. 476 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), 477 getPointerTy(DAG.getDataLayout())); 478 return Table; 479 } 480 481 /// This returns the relocation base for the given PIC jumptable, 482 /// the same as getPICJumpTableRelocBase, but as an MCExpr. 483 const MCExpr *X86TargetLowering:: 484 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 485 MCContext &Ctx) const { 486 // X86-64 uses RIP relative addressing based on the jump table label. 487 if (Subtarget.isPICStyleRIPRel() || 488 (Subtarget.is64Bit() && 489 getTargetMachine().getCodeModel() == CodeModel::Large)) 490 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 491 492 // Otherwise, the reference is relative to the PIC base. 493 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); 494 } 495 496 std::pair<const TargetRegisterClass *, uint8_t> 497 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, 498 MVT VT) const { 499 const TargetRegisterClass *RRC = nullptr; 500 uint8_t Cost = 1; 501 switch (VT.SimpleTy) { 502 default: 503 return TargetLowering::findRepresentativeClass(TRI, VT); 504 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 505 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; 506 break; 507 case MVT::x86mmx: 508 RRC = &X86::VR64RegClass; 509 break; 510 case MVT::f32: case MVT::f64: 511 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 512 case MVT::v4f32: case MVT::v2f64: 513 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: 514 case MVT::v8f32: case MVT::v4f64: 515 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64: 516 case MVT::v16f32: case MVT::v8f64: 517 RRC = &X86::VR128XRegClass; 518 break; 519 } 520 return std::make_pair(RRC, Cost); 521 } 522 523 unsigned X86TargetLowering::getAddressSpace() const { 524 if (Subtarget.is64Bit()) 525 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 
256 : 257; 526 return 256; 527 } 528 529 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { 530 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || 531 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17)); 532 } 533 534 static Constant* SegmentOffset(IRBuilderBase &IRB, 535 int Offset, unsigned AddressSpace) { 536 return ConstantExpr::getIntToPtr( 537 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), 538 IRB.getPtrTy(AddressSpace)); 539 } 540 541 Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const { 542 // glibc, bionic, and Fuchsia have a special slot for the stack guard in 543 // tcbhead_t; use it instead of the usual global variable (see 544 // sysdeps/{i386,x86_64}/nptl/tls.h) 545 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) { 546 unsigned AddressSpace = getAddressSpace(); 547 548 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value. 549 if (Subtarget.isTargetFuchsia()) 550 return SegmentOffset(IRB, 0x10, AddressSpace); 551 552 Module *M = IRB.GetInsertBlock()->getParent()->getParent(); 553 // Specially, some users may customize the base reg and offset. 554 int Offset = M->getStackProtectorGuardOffset(); 555 // If we don't set -stack-protector-guard-offset value: 556 // %fs:0x28, unless we're using a Kernel code model, in which case 557 // it's %gs:0x28. gs:0x14 on i386. 558 if (Offset == INT_MAX) 559 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; 560 561 StringRef GuardReg = M->getStackProtectorGuardReg(); 562 if (GuardReg == "fs") 563 AddressSpace = X86AS::FS; 564 else if (GuardReg == "gs") 565 AddressSpace = X86AS::GS; 566 567 // Use symbol guard if user specify. 568 StringRef GuardSymb = M->getStackProtectorGuardSymbol(); 569 if (!GuardSymb.empty()) { 570 GlobalVariable *GV = M->getGlobalVariable(GuardSymb); 571 if (!GV) { 572 Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext()) 573 : Type::getInt32Ty(M->getContext()); 574 GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage, 575 nullptr, GuardSymb, nullptr, 576 GlobalValue::NotThreadLocal, AddressSpace); 577 if (!Subtarget.isTargetDarwin()) 578 GV->setDSOLocal(M->getDirectAccessExternalData()); 579 } 580 return GV; 581 } 582 583 return SegmentOffset(IRB, Offset, AddressSpace); 584 } 585 return TargetLowering::getIRStackGuard(IRB); 586 } 587 588 void X86TargetLowering::insertSSPDeclarations(Module &M) const { 589 // MSVC CRT provides functionalities for stack protection. 590 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || 591 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { 592 // MSVC CRT has a global variable holding security cookie. 593 M.getOrInsertGlobal("__security_cookie", 594 PointerType::getUnqual(M.getContext())); 595 596 // MSVC CRT has a function to validate security cookie. 597 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( 598 "__security_check_cookie", Type::getVoidTy(M.getContext()), 599 PointerType::getUnqual(M.getContext())); 600 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) { 601 F->setCallingConv(CallingConv::X86_FastCall); 602 F->addParamAttr(0, Attribute::AttrKind::InReg); 603 } 604 return; 605 } 606 607 StringRef GuardMode = M.getStackProtectorGuard(); 608 609 // glibc, bionic, and Fuchsia have a special slot for the stack guard. 
610 if ((GuardMode == "tls" || GuardMode.empty()) &&
611 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
612 return;
613 TargetLowering::insertSSPDeclarations(M);
614 }
615
616 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
617 // MSVC CRT has a global variable holding security cookie.
618 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
619 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
620 return M.getGlobalVariable("__security_cookie");
621 }
622 return TargetLowering::getSDagStackGuard(M);
623 }
624
625 Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
626 // MSVC CRT has a function to validate security cookie.
627 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
628 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
629 return M.getFunction("__security_check_cookie");
630 }
631 return TargetLowering::getSSPStackGuardCheck(M);
632 }
633
634 Value *
635 X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
636 // Android provides a fixed TLS slot for the SafeStack pointer. See the
637 // definition of TLS_SLOT_SAFESTACK in
638 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
639 if (Subtarget.isTargetAndroid()) {
640 // The slot is %fs:0x48 on 64-bit targets (%gs:0x48 with the Kernel code
641 // model) and %gs:0x24 on i386.
642 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
643 return SegmentOffset(IRB, Offset, getAddressSpace());
644 }
645
646 // Fuchsia is similar.
647 if (Subtarget.isTargetFuchsia()) {
648 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
649 return SegmentOffset(IRB, 0x18, getAddressSpace());
650 }
651
652 return TargetLowering::getSafeStackPointerLocation(IRB);
653 }
654
655 //===----------------------------------------------------------------------===//
656 // Return Value Calling Convention Implementation
657 //===----------------------------------------------------------------------===//
658
659 bool X86TargetLowering::CanLowerReturn(
660 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
661 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
662 SmallVector<CCValAssign, 16> RVLocs;
663 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
664 return CCInfo.CheckReturn(Outs, RetCC_X86);
665 }
666
667 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
668 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
669 return ScratchRegs;
670 }
671
672 ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
673 // FIXME: We should def X86::FPCW for x87 as well, but doing so currently
674 // affects a surprisingly large number of lit tests.
675 static const MCPhysReg RCRegs[] = {X86::MXCSR}; 676 return RCRegs; 677 } 678 679 /// Lowers masks values (v*i1) to the local register values 680 /// \returns DAG node after lowering to register type 681 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, 682 const SDLoc &DL, SelectionDAG &DAG) { 683 EVT ValVT = ValArg.getValueType(); 684 685 if (ValVT == MVT::v1i1) 686 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ValLoc, ValArg, 687 DAG.getIntPtrConstant(0, DL)); 688 689 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) || 690 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) { 691 // Two stage lowering might be required 692 // bitcast: v8i1 -> i8 / v16i1 -> i16 693 // anyextend: i8 -> i32 / i16 -> i32 694 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16; 695 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg); 696 if (ValLoc == MVT::i32) 697 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValToCopy); 698 return ValToCopy; 699 } 700 701 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) || 702 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) { 703 // One stage lowering is required 704 // bitcast: v32i1 -> i32 / v64i1 -> i64 705 return DAG.getBitcast(ValLoc, ValArg); 706 } 707 708 return DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValArg); 709 } 710 711 /// Breaks v64i1 value into two registers and adds the new node to the DAG 712 static void Passv64i1ArgInRegs( 713 const SDLoc &DL, SelectionDAG &DAG, SDValue &Arg, 714 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA, 715 CCValAssign &NextVA, const X86Subtarget &Subtarget) { 716 assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); 717 assert(Subtarget.is32Bit() && "Expecting 32 bit target"); 718 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value"); 719 assert(VA.isRegLoc() && NextVA.isRegLoc() && 720 "The value should reside in two registers"); 721 722 // Before splitting the value we cast it to i64 723 Arg = DAG.getBitcast(MVT::i64, Arg); 724 725 // Splitting the value into two i32 types 726 SDValue Lo, Hi; 727 std::tie(Lo, Hi) = DAG.SplitScalar(Arg, DL, MVT::i32, MVT::i32); 728 729 // Attach the two i32 types into corresponding registers 730 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo)); 731 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi)); 732 } 733 734 SDValue 735 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 736 bool isVarArg, 737 const SmallVectorImpl<ISD::OutputArg> &Outs, 738 const SmallVectorImpl<SDValue> &OutVals, 739 const SDLoc &dl, SelectionDAG &DAG) const { 740 MachineFunction &MF = DAG.getMachineFunction(); 741 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 742 743 // In some cases we need to disable registers from the default CSR list. 744 // For example, when they are used as return registers (preserve_* and X86's 745 // regcall) or for argument passing (X86's regcall). 
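// Functions marked "no_caller_saved_registers" preserve every register they
// clobber, so the registers used to return values must likewise be removed
// from the callee-saved list built here.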
746 bool ShouldDisableCalleeSavedRegister = 747 shouldDisableRetRegFromCSR(CallConv) || 748 MF.getFunction().hasFnAttribute("no_caller_saved_registers"); 749 750 if (CallConv == CallingConv::X86_INTR && !Outs.empty()) 751 report_fatal_error("X86 interrupts may not return any value"); 752 753 SmallVector<CCValAssign, 16> RVLocs; 754 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); 755 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 756 757 SmallVector<std::pair<Register, SDValue>, 4> RetVals; 758 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E; 759 ++I, ++OutsIndex) { 760 CCValAssign &VA = RVLocs[I]; 761 assert(VA.isRegLoc() && "Can only return in registers!"); 762 763 // Add the register to the CalleeSaveDisableRegs list. 764 if (ShouldDisableCalleeSavedRegister) 765 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg()); 766 767 SDValue ValToCopy = OutVals[OutsIndex]; 768 EVT ValVT = ValToCopy.getValueType(); 769 770 // Promote values to the appropriate types. 771 if (VA.getLocInfo() == CCValAssign::SExt) 772 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); 773 else if (VA.getLocInfo() == CCValAssign::ZExt) 774 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); 775 else if (VA.getLocInfo() == CCValAssign::AExt) { 776 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) 777 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG); 778 else 779 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); 780 } 781 else if (VA.getLocInfo() == CCValAssign::BCvt) 782 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); 783 784 assert(VA.getLocInfo() != CCValAssign::FPExt && 785 "Unexpected FP-extend for return value."); 786 787 // Report an error if we have attempted to return a value via an XMM 788 // register and SSE was disabled. 789 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) { 790 errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); 791 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. 792 } else if (!Subtarget.hasSSE2() && 793 X86::FR64XRegClass.contains(VA.getLocReg()) && 794 ValVT == MVT::f64) { 795 // When returning a double via an XMM register, report an error if SSE2 is 796 // not enabled. 797 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); 798 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. 799 } 800 801 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 802 // the RET instruction and handled by the FP Stackifier. 803 if (VA.getLocReg() == X86::FP0 || 804 VA.getLocReg() == X86::FP1) { 805 // If this is a copy from an xmm register to ST(0), use an FPExtend to 806 // change the value to the FP stack register class. 807 if (isScalarFPTypeInSSEReg(VA.getValVT())) 808 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 809 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); 810 // Don't emit a copytoreg. 811 continue; 812 } 813 814 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 815 // which is returned in RAX / RDX. 816 if (Subtarget.is64Bit()) { 817 if (ValVT == MVT::x86mmx) { 818 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 819 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); 820 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 821 ValToCopy); 822 // If we don't have SSE2 available, convert to v4f32 so the generated 823 // register is legal. 
824 if (!Subtarget.hasSSE2()) 825 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); 826 } 827 } 828 } 829 830 if (VA.needsCustom()) { 831 assert(VA.getValVT() == MVT::v64i1 && 832 "Currently the only custom case is when we split v64i1 to 2 regs"); 833 834 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I], 835 Subtarget); 836 837 // Add the second register to the CalleeSaveDisableRegs list. 838 if (ShouldDisableCalleeSavedRegister) 839 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg()); 840 } else { 841 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); 842 } 843 } 844 845 SDValue Glue; 846 SmallVector<SDValue, 6> RetOps; 847 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 848 // Operand #1 = Bytes To Pop 849 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, 850 MVT::i32)); 851 852 // Copy the result values into the output registers. 853 for (auto &RetVal : RetVals) { 854 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) { 855 RetOps.push_back(RetVal.second); 856 continue; // Don't emit a copytoreg. 857 } 858 859 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue); 860 Glue = Chain.getValue(1); 861 RetOps.push_back( 862 DAG.getRegister(RetVal.first, RetVal.second.getValueType())); 863 } 864 865 // Swift calling convention does not require we copy the sret argument 866 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift. 867 868 // All x86 ABIs require that for returning structs by value we copy 869 // the sret argument into %rax/%eax (depending on ABI) for the return. 870 // We saved the argument into a virtual register in the entry block, 871 // so now we copy the value out and into %rax/%eax. 872 // 873 // Checking Function.hasStructRetAttr() here is insufficient because the IR 874 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is 875 // false, then an sret argument may be implicitly inserted in the SelDAG. In 876 // either case FuncInfo->setSRetReturnReg() will have been called. 877 if (Register SRetReg = FuncInfo->getSRetReturnReg()) { 878 // When we have both sret and another return value, we should use the 879 // original Chain stored in RetOps[0], instead of the current Chain updated 880 // in the above loop. If we only have sret, RetOps[0] equals to Chain. 881 882 // For the case of sret and another return value, we have 883 // Chain_0 at the function entry 884 // Chain_1 = getCopyToReg(Chain_0) in the above loop 885 // If we use Chain_1 in getCopyFromReg, we will have 886 // Val = getCopyFromReg(Chain_1) 887 // Chain_2 = getCopyToReg(Chain_1, Val) from below 888 889 // getCopyToReg(Chain_0) will be glued together with 890 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be 891 // in Unit B, and we will have cyclic dependency between Unit A and Unit B: 892 // Data dependency from Unit B to Unit A due to usage of Val in 893 // getCopyToReg(Chain_1, Val) 894 // Chain dependency from Unit A to Unit B 895 896 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg. 897 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg, 898 getPointerTy(MF.getDataLayout())); 899 900 Register RetValReg 901 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ? 902 X86::RAX : X86::EAX; 903 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue); 904 Glue = Chain.getValue(1); 905 906 // RAX/EAX now acts like a return value. 
907 RetOps.push_back( 908 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); 909 910 // Add the returned register to the CalleeSaveDisableRegs list. Don't do 911 // this however for preserve_most/preserve_all to minimize the number of 912 // callee-saved registers for these CCs. 913 if (ShouldDisableCalleeSavedRegister && 914 CallConv != CallingConv::PreserveAll && 915 CallConv != CallingConv::PreserveMost) 916 MF.getRegInfo().disableCalleeSavedRegister(RetValReg); 917 } 918 919 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); 920 const MCPhysReg *I = 921 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 922 if (I) { 923 for (; *I; ++I) { 924 if (X86::GR64RegClass.contains(*I)) 925 RetOps.push_back(DAG.getRegister(*I, MVT::i64)); 926 else 927 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 928 } 929 } 930 931 RetOps[0] = Chain; // Update chain. 932 933 // Add the glue if we have it. 934 if (Glue.getNode()) 935 RetOps.push_back(Glue); 936 937 X86ISD::NodeType opcode = X86ISD::RET_GLUE; 938 if (CallConv == CallingConv::X86_INTR) 939 opcode = X86ISD::IRET; 940 return DAG.getNode(opcode, dl, MVT::Other, RetOps); 941 } 942 943 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 944 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0)) 945 return false; 946 947 SDValue TCChain = Chain; 948 SDNode *Copy = *N->use_begin(); 949 if (Copy->getOpcode() == ISD::CopyToReg) { 950 // If the copy has a glue operand, we conservatively assume it isn't safe to 951 // perform a tail call. 952 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 953 return false; 954 TCChain = Copy->getOperand(0); 955 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 956 return false; 957 958 bool HasRet = false; 959 for (const SDNode *U : Copy->uses()) { 960 if (U->getOpcode() != X86ISD::RET_GLUE) 961 return false; 962 // If we are returning more than one value, we can definitely 963 // not make a tail call see PR19530 964 if (U->getNumOperands() > 4) 965 return false; 966 if (U->getNumOperands() == 4 && 967 U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue) 968 return false; 969 HasRet = true; 970 } 971 972 if (!HasRet) 973 return false; 974 975 Chain = TCChain; 976 return true; 977 } 978 979 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, 980 ISD::NodeType ExtendKind) const { 981 MVT ReturnMVT = MVT::i32; 982 983 bool Darwin = Subtarget.getTargetTriple().isOSDarwin(); 984 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) { 985 // The ABI does not require i1, i8 or i16 to be extended. 986 // 987 // On Darwin, there is code in the wild relying on Clang's old behaviour of 988 // always extending i8/i16 return values, so keep doing that for now. 989 // (PR26665). 990 ReturnMVT = MVT::i8; 991 } 992 993 EVT MinVT = getRegisterType(Context, ReturnMVT); 994 return VT.bitsLT(MinVT) ? MinVT : VT; 995 } 996 997 /// Reads two 32 bit registers and creates a 64 bit mask value. 998 /// \param VA The current 32 bit value that need to be assigned. 999 /// \param NextVA The next 32 bit value that need to be assigned. 1000 /// \param Root The parent DAG node. 1001 /// \param [in,out] InGlue Represents SDvalue in the parent DAG node for 1002 /// glue purposes. In the case the DAG is already using 1003 /// physical register instead of virtual, we should glue 1004 /// our new SDValue to InGlue SDvalue. 1005 /// \return a new SDvalue of size 64bit. 
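///
/// For example, under the 32-bit regcall convention a v64i1 mask arrives as
/// two i32 halves in a pair of GPRs; each half is bitcast to v32i1 and the
/// halves are concatenated back into a single v64i1.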
1006 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
1007 SDValue &Root, SelectionDAG &DAG,
1008 const SDLoc &DL, const X86Subtarget &Subtarget,
1009 SDValue *InGlue = nullptr) {
1010 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
1011 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
1012 assert(VA.getValVT() == MVT::v64i1 &&
1013 "Expecting first location of 64 bit width type");
1014 assert(NextVA.getValVT() == VA.getValVT() &&
1015 "The locations should have the same type");
1016 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
1017 "The values should reside in two registers");
1018
1019 SDValue Lo, Hi;
1020 SDValue ArgValueLo, ArgValueHi;
1021
1022 MachineFunction &MF = DAG.getMachineFunction();
1023 const TargetRegisterClass *RC = &X86::GR32RegClass;
1024
1025 // Read a 32 bit value from the registers.
1026 if (nullptr == InGlue) {
1027 // When no physical register is present,
1028 // create an intermediate virtual register.
1029 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
1030 ArgValueLo = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
1031 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
1032 ArgValueHi = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
1033 } else {
1034 // When a physical register is available, read the value from it and glue
1035 // the reads together.
1036 ArgValueLo =
1037 DAG.getCopyFromReg(Root, DL, VA.getLocReg(), MVT::i32, *InGlue);
1038 *InGlue = ArgValueLo.getValue(2);
1039 ArgValueHi =
1040 DAG.getCopyFromReg(Root, DL, NextVA.getLocReg(), MVT::i32, *InGlue);
1041 *InGlue = ArgValueHi.getValue(2);
1042 }
1043
1044 // Convert the low i32 into a v32i1.
1045 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
1046
1047 // Likewise, convert the high i32 into a v32i1.
1048 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
1049
1050 // Concatenate the two values together.
1051 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v64i1, Lo, Hi);
1052 }
1053
1054 /// Lower a register of various sizes (8/16/32/64 bit)
1055 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
1056 /// \returns a DAG node containing the operand after lowering to the mask type.
1057 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
1058 const EVT &ValLoc, const SDLoc &DL,
1059 SelectionDAG &DAG) {
1060 SDValue ValReturned = ValArg;
1061
1062 if (ValVT == MVT::v1i1)
1063 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, ValReturned);
1064
1065 if (ValVT == MVT::v64i1) {
1066 // On 32-bit targets this case is handled by getv64i1Argument.
1067 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
1068 // On 64-bit targets no truncation is needed; the bitcast below suffices.
1069 } else {
1070 MVT MaskLenVT;
1071 switch (ValVT.getSimpleVT().SimpleTy) {
1072 case MVT::v8i1:
1073 MaskLenVT = MVT::i8;
1074 break;
1075 case MVT::v16i1:
1076 MaskLenVT = MVT::i16;
1077 break;
1078 case MVT::v32i1:
1079 MaskLenVT = MVT::i32;
1080 break;
1081 default:
1082 llvm_unreachable("Expecting a vector of i1 types");
1083 }
1084
1085 ValReturned = DAG.getNode(ISD::TRUNCATE, DL, MaskLenVT, ValReturned);
1086 }
1087 return DAG.getBitcast(ValVT, ValReturned);
1088 }
1089
1090 /// Lower the result values of a call into the
1091 /// appropriate copies out of appropriate physical registers.
1092 /// 1093 SDValue X86TargetLowering::LowerCallResult( 1094 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, 1095 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 1096 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, 1097 uint32_t *RegMask) const { 1098 1099 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); 1100 // Assign locations to each value returned by this call. 1101 SmallVector<CCValAssign, 16> RVLocs; 1102 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 1103 *DAG.getContext()); 1104 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1105 1106 // Copy all of the result registers out of their specified physreg. 1107 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E; 1108 ++I, ++InsIndex) { 1109 CCValAssign &VA = RVLocs[I]; 1110 EVT CopyVT = VA.getLocVT(); 1111 1112 // In some calling conventions we need to remove the used registers 1113 // from the register mask. 1114 if (RegMask) { 1115 for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg())) 1116 RegMask[SubReg / 32] &= ~(1u << (SubReg % 32)); 1117 } 1118 1119 // Report an error if there was an attempt to return FP values via XMM 1120 // registers. 1121 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) { 1122 errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); 1123 if (VA.getLocReg() == X86::XMM1) 1124 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts. 1125 else 1126 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. 1127 } else if (!Subtarget.hasSSE2() && 1128 X86::FR64XRegClass.contains(VA.getLocReg()) && 1129 CopyVT == MVT::f64) { 1130 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); 1131 if (VA.getLocReg() == X86::XMM1) 1132 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts. 1133 else 1134 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. 1135 } 1136 1137 // If we prefer to use the value in xmm registers, copy it out as f80 and 1138 // use a truncate to move it from fp stack reg to xmm reg. 1139 bool RoundAfterCopy = false; 1140 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && 1141 isScalarFPTypeInSSEReg(VA.getValVT())) { 1142 if (!Subtarget.hasX87()) 1143 report_fatal_error("X87 register return with X87 disabled"); 1144 CopyVT = MVT::f80; 1145 RoundAfterCopy = (CopyVT != VA.getLocVT()); 1146 } 1147 1148 SDValue Val; 1149 if (VA.needsCustom()) { 1150 assert(VA.getValVT() == MVT::v64i1 && 1151 "Currently the only custom case is when we split v64i1 to 2 regs"); 1152 Val = 1153 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue); 1154 } else { 1155 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue) 1156 .getValue(1); 1157 Val = Chain.getValue(0); 1158 InGlue = Chain.getValue(2); 1159 } 1160 1161 if (RoundAfterCopy) 1162 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1163 // This truncation won't change the value. 
1164 DAG.getIntPtrConstant(1, dl, /*isTarget=*/true)); 1165 1166 if (VA.isExtInLoc()) { 1167 if (VA.getValVT().isVector() && 1168 VA.getValVT().getScalarType() == MVT::i1 && 1169 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || 1170 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { 1171 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 1172 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG); 1173 } else 1174 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); 1175 } 1176 1177 if (VA.getLocInfo() == CCValAssign::BCvt) 1178 Val = DAG.getBitcast(VA.getValVT(), Val); 1179 1180 InVals.push_back(Val); 1181 } 1182 1183 return Chain; 1184 } 1185 1186 //===----------------------------------------------------------------------===// 1187 // C & StdCall & Fast Calling Convention implementation 1188 //===----------------------------------------------------------------------===// 1189 // StdCall calling convention seems to be standard for many Windows' API 1190 // routines and around. It differs from C calling convention just a little: 1191 // callee should clean up the stack, not caller. Symbols should be also 1192 // decorated in some fancy way :) It doesn't support any vector arguments. 1193 // For info on fast calling convention see Fast Calling Convention (tail call) 1194 // implementation LowerX86_32FastCCCallTo. 1195 1196 /// Determines whether Args, either a set of outgoing arguments to a call, or a 1197 /// set of incoming args of a call, contains an sret pointer that the callee 1198 /// pops 1199 template <typename T> 1200 static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args, 1201 const X86Subtarget &Subtarget) { 1202 // Not C++20 (yet), so no concepts available. 1203 static_assert(std::is_same_v<T, ISD::OutputArg> || 1204 std::is_same_v<T, ISD::InputArg>, 1205 "requires ISD::OutputArg or ISD::InputArg"); 1206 1207 // Only 32-bit pops the sret. It's a 64-bit world these days, so early-out 1208 // for most compilations. 1209 if (!Subtarget.is32Bit()) 1210 return false; 1211 1212 if (Args.empty()) 1213 return false; 1214 1215 // Most calls do not have an sret argument, check the arg next. 1216 const ISD::ArgFlagsTy &Flags = Args[0].Flags; 1217 if (!Flags.isSRet() || Flags.isInReg()) 1218 return false; 1219 1220 // The MSVCabi does not pop the sret. 1221 if (Subtarget.getTargetTriple().isOSMSVCRT()) 1222 return false; 1223 1224 // MCUs don't pop the sret 1225 if (Subtarget.isTargetMCU()) 1226 return false; 1227 1228 // Callee pops argument 1229 return true; 1230 } 1231 1232 /// Make a copy of an aggregate at address specified by "Src" to address 1233 /// "Dst" with size and alignment information specified by the specific 1234 /// parameter attribute. The copy will be passed as a byval function parameter. 1235 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, 1236 SDValue Chain, ISD::ArgFlagsTy Flags, 1237 SelectionDAG &DAG, const SDLoc &dl) { 1238 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl); 1239 1240 return DAG.getMemcpy( 1241 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(), 1242 /*isVolatile*/ false, /*AlwaysInline=*/true, 1243 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo()); 1244 } 1245 1246 /// Return true if the calling convention is one that we can guarantee TCO for. 
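/// An actual guarantee is only made by shouldGuaranteeTCO() below: either the
/// convention itself mandates tail calls (tailcc, swifttailcc) or
/// GuaranteedTailCallOpt (-tailcallopt) must be enabled.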
1247 static bool canGuaranteeTCO(CallingConv::ID CC) { 1248 return (CC == CallingConv::Fast || CC == CallingConv::GHC || 1249 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || 1250 CC == CallingConv::Tail || CC == CallingConv::SwiftTail); 1251 } 1252 1253 /// Return true if we might ever do TCO for calls with this calling convention. 1254 static bool mayTailCallThisCC(CallingConv::ID CC) { 1255 switch (CC) { 1256 // C calling conventions: 1257 case CallingConv::C: 1258 case CallingConv::Win64: 1259 case CallingConv::X86_64_SysV: 1260 // Callee pop conventions: 1261 case CallingConv::X86_ThisCall: 1262 case CallingConv::X86_StdCall: 1263 case CallingConv::X86_VectorCall: 1264 case CallingConv::X86_FastCall: 1265 // Swift: 1266 case CallingConv::Swift: 1267 return true; 1268 default: 1269 return canGuaranteeTCO(CC); 1270 } 1271 } 1272 1273 /// Return true if the function is being made into a tailcall target by 1274 /// changing its ABI. 1275 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { 1276 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || 1277 CC == CallingConv::Tail || CC == CallingConv::SwiftTail; 1278 } 1279 1280 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { 1281 if (!CI->isTailCall()) 1282 return false; 1283 1284 CallingConv::ID CalleeCC = CI->getCallingConv(); 1285 if (!mayTailCallThisCC(CalleeCC)) 1286 return false; 1287 1288 return true; 1289 } 1290 1291 SDValue 1292 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, 1293 const SmallVectorImpl<ISD::InputArg> &Ins, 1294 const SDLoc &dl, SelectionDAG &DAG, 1295 const CCValAssign &VA, 1296 MachineFrameInfo &MFI, unsigned i) const { 1297 // Create the nodes corresponding to a load from this parameter slot. 1298 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1299 bool AlwaysUseMutable = shouldGuaranteeTCO( 1300 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); 1301 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1302 EVT ValVT; 1303 MVT PtrVT = getPointerTy(DAG.getDataLayout()); 1304 1305 // If value is passed by pointer we have address passed instead of the value 1306 // itself. No need to extend if the mask value and location share the same 1307 // absolute size. 1308 bool ExtendedInMem = 1309 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 && 1310 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits(); 1311 1312 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) 1313 ValVT = VA.getLocVT(); 1314 else 1315 ValVT = VA.getValVT(); 1316 1317 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1318 // changed with more analysis. 1319 // In case of tail call optimization mark all arguments mutable. Since they 1320 // could be overwritten by lowering of arguments in case of a tail call. 1321 if (Flags.isByVal()) { 1322 unsigned Bytes = Flags.getByValSize(); 1323 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. 1324 1325 // FIXME: For now, all byval parameter objects are marked as aliasing. This 1326 // can be improved with deeper analysis. 1327 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable, 1328 /*isAliased=*/true); 1329 return DAG.getFrameIndex(FI, PtrVT); 1330 } 1331 1332 EVT ArgVT = Ins[i].ArgVT; 1333 1334 // If this is a vector that has been split into multiple parts, don't elide 1335 // the copy. The layout on the stack may not match the packed in-memory 1336 // layout. 
1337 bool ScalarizedVector = ArgVT.isVector() && !VA.getLocVT().isVector(); 1338 1339 // This is an argument in memory. We might be able to perform copy elision. 1340 // If the argument is passed directly in memory without any extension, then we 1341 // can perform copy elision. Large vector types, for example, may be passed 1342 // indirectly by pointer. 1343 if (Flags.isCopyElisionCandidate() && 1344 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem && 1345 !ScalarizedVector) { 1346 SDValue PartAddr; 1347 if (Ins[i].PartOffset == 0) { 1348 // If this is a one-part value or the first part of a multi-part value, 1349 // create a stack object for the entire argument value type and return a 1350 // load from our portion of it. This assumes that if the first part of an 1351 // argument is in memory, the rest will also be in memory. 1352 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(), 1353 /*IsImmutable=*/false); 1354 PartAddr = DAG.getFrameIndex(FI, PtrVT); 1355 return DAG.getLoad( 1356 ValVT, dl, Chain, PartAddr, 1357 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 1358 } 1359 1360 // This is not the first piece of an argument in memory. See if there is 1361 // already a fixed stack object including this offset. If so, assume it 1362 // was created by the PartOffset == 0 branch above and create a load from 1363 // the appropriate offset into it. 1364 int64_t PartBegin = VA.getLocMemOffset(); 1365 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8; 1366 int FI = MFI.getObjectIndexBegin(); 1367 for (; MFI.isFixedObjectIndex(FI); ++FI) { 1368 int64_t ObjBegin = MFI.getObjectOffset(FI); 1369 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI); 1370 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd) 1371 break; 1372 } 1373 if (MFI.isFixedObjectIndex(FI)) { 1374 SDValue Addr = 1375 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT), 1376 DAG.getIntPtrConstant(Ins[i].PartOffset, dl)); 1377 return DAG.getLoad(ValVT, dl, Chain, Addr, 1378 MachinePointerInfo::getFixedStack( 1379 DAG.getMachineFunction(), FI, Ins[i].PartOffset)); 1380 } 1381 } 1382 1383 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8, 1384 VA.getLocMemOffset(), isImmutable); 1385 1386 // Set SExt or ZExt flag. 1387 if (VA.getLocInfo() == CCValAssign::ZExt) { 1388 MFI.setObjectZExt(FI, true); 1389 } else if (VA.getLocInfo() == CCValAssign::SExt) { 1390 MFI.setObjectSExt(FI, true); 1391 } 1392 1393 MaybeAlign Alignment; 1394 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() && 1395 ValVT != MVT::f80) 1396 Alignment = MaybeAlign(4); 1397 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 1398 SDValue Val = DAG.getLoad( 1399 ValVT, dl, Chain, FIN, 1400 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 1401 Alignment); 1402 return ExtendedInMem 1403 ? (VA.getValVT().isVector() 1404 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val) 1405 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)) 1406 : Val; 1407 } 1408 1409 // FIXME: Get this from tablegen. 
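// Integer argument GPRs for the 64-bit conventions: Win64 passes arguments in
// RCX, RDX, R8 and R9 (with a four-slot shadow area on the stack), while the
// SysV ABI uses RDI, RSI, RDX, RCX, R8 and R9.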
1410 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv, 1411 const X86Subtarget &Subtarget) { 1412 assert(Subtarget.is64Bit()); 1413 1414 if (Subtarget.isCallingConvWin64(CallConv)) { 1415 static const MCPhysReg GPR64ArgRegsWin64[] = { 1416 X86::RCX, X86::RDX, X86::R8, X86::R9 1417 }; 1418 return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); 1419 } 1420 1421 static const MCPhysReg GPR64ArgRegs64Bit[] = { 1422 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1423 }; 1424 return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); 1425 } 1426 1427 // FIXME: Get this from tablegen. 1428 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, 1429 CallingConv::ID CallConv, 1430 const X86Subtarget &Subtarget) { 1431 assert(Subtarget.is64Bit()); 1432 if (Subtarget.isCallingConvWin64(CallConv)) { 1433 // The XMM registers which might contain var arg parameters are shadowed 1434 // in their paired GPR. So we only need to save the GPR to their home 1435 // slots. 1436 // TODO: __vectorcall will change this. 1437 return std::nullopt; 1438 } 1439 1440 bool isSoftFloat = Subtarget.useSoftFloat(); 1441 if (isSoftFloat || !Subtarget.hasSSE1()) 1442 // Kernel mode asks for SSE to be disabled, so there are no XMM argument 1443 // registers. 1444 return std::nullopt; 1445 1446 static const MCPhysReg XMMArgRegs64Bit[] = { 1447 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1448 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1449 }; 1450 return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); 1451 } 1452 1453 #ifndef NDEBUG 1454 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) { 1455 return llvm::is_sorted( 1456 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool { 1457 return A.getValNo() < B.getValNo(); 1458 }); 1459 } 1460 #endif 1461 1462 namespace { 1463 /// This is a helper class for lowering variable arguments parameters. 1464 class VarArgsLoweringHelper { 1465 public: 1466 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc, 1467 SelectionDAG &DAG, const X86Subtarget &Subtarget, 1468 CallingConv::ID CallConv, CCState &CCInfo) 1469 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget), 1470 TheMachineFunction(DAG.getMachineFunction()), 1471 TheFunction(TheMachineFunction.getFunction()), 1472 FrameInfo(TheMachineFunction.getFrameInfo()), 1473 FrameLowering(*Subtarget.getFrameLowering()), 1474 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv), 1475 CCInfo(CCInfo) {} 1476 1477 // Lower variable arguments parameters. 
1478 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize); 1479 1480 private: 1481 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize); 1482 1483 void forwardMustTailParameters(SDValue &Chain); 1484 1485 bool is64Bit() const { return Subtarget.is64Bit(); } 1486 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); } 1487 1488 X86MachineFunctionInfo *FuncInfo; 1489 const SDLoc &DL; 1490 SelectionDAG &DAG; 1491 const X86Subtarget &Subtarget; 1492 MachineFunction &TheMachineFunction; 1493 const Function &TheFunction; 1494 MachineFrameInfo &FrameInfo; 1495 const TargetFrameLowering &FrameLowering; 1496 const TargetLowering &TargLowering; 1497 CallingConv::ID CallConv; 1498 CCState &CCInfo; 1499 }; 1500 } // namespace 1501 1502 void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters( 1503 SDValue &Chain, unsigned StackSize) { 1504 // If the function takes variable number of arguments, make a frame index for 1505 // the start of the first vararg value... for expansion of llvm.va_start. We 1506 // can skip this if there are no va_start calls. 1507 if (is64Bit() || (CallConv != CallingConv::X86_FastCall && 1508 CallConv != CallingConv::X86_ThisCall)) { 1509 FuncInfo->setVarArgsFrameIndex( 1510 FrameInfo.CreateFixedObject(1, StackSize, true)); 1511 } 1512 1513 // 64-bit calling conventions support varargs and register parameters, so we 1514 // have to do extra work to spill them in the prologue. 1515 if (is64Bit()) { 1516 // Find the first unallocated argument registers. 1517 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); 1518 ArrayRef<MCPhysReg> ArgXMMs = 1519 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget); 1520 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); 1521 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); 1522 1523 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) && 1524 "SSE register cannot be used when SSE is disabled!"); 1525 1526 if (isWin64()) { 1527 // Get to the caller-allocated home save location. Add 8 to account 1528 // for the return address. 1529 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8; 1530 FuncInfo->setRegSaveFrameIndex( 1531 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 1532 // Fixup to set vararg frame on shadow area (4 x i64). 1533 if (NumIntRegs < 4) 1534 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 1535 } else { 1536 // For X86-64, if there are vararg parameters that are passed via 1537 // registers, then we must store them to their spots on the stack so 1538 // they may be loaded by dereferencing the result of va_next. 1539 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1540 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); 1541 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject( 1542 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false)); 1543 } 1544 1545 SmallVector<SDValue, 6> 1546 LiveGPRs; // list of SDValue for GPR registers keeping live input value 1547 SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers 1548 // keeping live input value 1549 SDValue ALVal; // if applicable keeps SDValue for %al register 1550 1551 // Gather all the live in physical registers. 
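// For reference, on non-Windows x86-64 the register save area created above
// follows the SysV AMD64 va_list layout: 6 GPRs * 8 bytes (offsets 0..47)
// followed by 8 XMM registers * 16 bytes (offsets 48..175), 176 bytes total.
// The VarArgsGPOffset/VarArgsFPOffset values set above become the initial
// gp_offset/fp_offset of the va_list; e.g. for f(const char *fmt, ...) the
// named 'fmt' consumes RDI, so gp_offset starts at 8 and fp_offset at 48.
// The loop below then picks up each still-unallocated argument register so
// its value can be spilled into that area.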
1552 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
1553 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
1554 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
1555 }
1556 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
1557 if (!AvailableXmms.empty()) {
1558 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
1559 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
1560 for (MCPhysReg Reg : AvailableXmms) {
1561 // FastRegisterAllocator spills virtual registers at basic
1562 // block boundaries. That leads to uses of xmm registers
1563 // outside of the check for %al. Pass physical registers to
1564 // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
1565 TheMachineFunction.getRegInfo().addLiveIn(Reg);
1566 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
1567 }
1568 }
1569
1570 // Store the integer parameter registers.
1571 SmallVector<SDValue, 8> MemOps;
1572 SDValue RSFIN =
1573 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
1574 TargLowering.getPointerTy(DAG.getDataLayout()));
1575 unsigned Offset = FuncInfo->getVarArgsGPOffset();
1576 for (SDValue Val : LiveGPRs) {
1577 SDValue FIN = DAG.getNode(ISD::ADD, DL,
1578 TargLowering.getPointerTy(DAG.getDataLayout()),
1579 RSFIN, DAG.getIntPtrConstant(Offset, DL));
1580 SDValue Store =
1581 DAG.getStore(Val.getValue(1), DL, Val, FIN,
1582 MachinePointerInfo::getFixedStack(
1583 DAG.getMachineFunction(),
1584 FuncInfo->getRegSaveFrameIndex(), Offset));
1585 MemOps.push_back(Store);
1586 Offset += 8;
1587 }
1588
1589 // Now store the XMM (fp + vector) parameter registers.
1590 if (!LiveXMMRegs.empty()) {
1591 SmallVector<SDValue, 12> SaveXMMOps;
1592 SaveXMMOps.push_back(Chain);
1593 SaveXMMOps.push_back(ALVal);
1594 SaveXMMOps.push_back(RSFIN);
1595 SaveXMMOps.push_back(
1596 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
1597 llvm::append_range(SaveXMMOps, LiveXMMRegs);
1598 MachineMemOperand *StoreMMO =
1599 DAG.getMachineFunction().getMachineMemOperand(
1600 MachinePointerInfo::getFixedStack(
1601 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
1602 Offset),
1603 MachineMemOperand::MOStore, 128, Align(16));
1604 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
1605 DL, DAG.getVTList(MVT::Other),
1606 SaveXMMOps, MVT::i8, StoreMMO));
1607 }
1608
1609 if (!MemOps.empty())
1610 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
1611 }
1612 }
1613
1614 void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
1615 // Find the largest legal vector type.
1616 MVT VecVT = MVT::Other;
1617 // FIXME: Only some x86_32 calling conventions support AVX512.
1618 if (Subtarget.useAVX512Regs() &&
1619 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
1620 CallConv == CallingConv::Intel_OCL_BI)))
1621 VecVT = MVT::v16f32;
1622 else if (Subtarget.hasAVX())
1623 VecVT = MVT::v8f32;
1624 else if (Subtarget.hasSSE2())
1625 VecVT = MVT::v4f32;
1626
1627 // We forward some GPRs and some vector types.
1628 SmallVector<MVT, 2> RegParmTypes;
1629 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
1630 RegParmTypes.push_back(IntVT);
1631 if (VecVT != MVT::Other)
1632 RegParmTypes.push_back(VecVT);
1633
1634 // Compute the set of forwarded registers. The rest are scratch.
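// A shape of IR that exercises this path (illustrative only) is a variadic
// perfect-forwarding thunk, roughly:
//
//   define void @thunk(ptr %p, ...) {
//     musttail call void (ptr, ...) @target(ptr %p, ...)
//     ret void
//   }
//
// Every register that might carry a variadic argument has to stay live up to
// the musttail call, which is why the forwarded set is computed here.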
1635 SmallVectorImpl<ForwardedRegister> &Forwards =
1636 FuncInfo->getForwardedMustTailRegParms();
1637 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
1638
1639 // Forward AL for SysV x86_64 targets, since it is used for varargs.
1640 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
1641 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
1642 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
1643 }
1644
1645 // Copy all forwards from physical to virtual registers.
1646 for (ForwardedRegister &FR : Forwards) {
1647 // FIXME: Can we use a less constrained schedule?
1648 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
1649 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
1650 TargLowering.getRegClassFor(FR.VT));
1651 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
1652 }
1653 }
1654
1655 void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
1656 unsigned StackSize) {
1657 // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
1658 // If necessary, it will be set to the correct value later.
1659 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
1660 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1661
1662 if (FrameInfo.hasVAStart())
1663 createVarArgAreaAndStoreRegisters(Chain, StackSize);
1664
1665 if (FrameInfo.hasMustTailInVarArgFunc())
1666 forwardMustTailParameters(Chain);
1667 }
1668
1669 SDValue X86TargetLowering::LowerFormalArguments(
1670 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
1671 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1672 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1673 MachineFunction &MF = DAG.getMachineFunction();
1674 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1675
1676 const Function &F = MF.getFunction();
1677 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
1678 F.getName() == "main")
1679 FuncInfo->setForceFramePointer(true);
1680
1681 MachineFrameInfo &MFI = MF.getFrameInfo();
1682 bool Is64Bit = Subtarget.is64Bit();
1683 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
1684
1685 assert(
1686 !(IsVarArg && canGuaranteeTCO(CallConv)) &&
1687 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
1688
1689 // Assign locations to all of the incoming arguments.
1690 SmallVector<CCValAssign, 16> ArgLocs;
1691 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
1692
1693 // Allocate shadow area for Win64.
1694 if (IsWin64)
1695 CCInfo.AllocateStack(32, Align(8));
1696
1697 CCInfo.AnalyzeArguments(Ins, CC_X86);
1698
1699 // In the vectorcall calling convention a second pass is required for HVA
1700 // types.
1701 if (CallingConv::X86_VectorCall == CallConv) {
1702 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
1703 }
1704
1705 // The next loop assumes that the locations are in the same order as the
1706 // input arguments.
1707 assert(isSortedByValueNo(ArgLocs) && 1708 "Argument Location list must be sorted before lowering"); 1709 1710 SDValue ArgValue; 1711 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; 1712 ++I, ++InsIndex) { 1713 assert(InsIndex < Ins.size() && "Invalid Ins index"); 1714 CCValAssign &VA = ArgLocs[I]; 1715 1716 if (VA.isRegLoc()) { 1717 EVT RegVT = VA.getLocVT(); 1718 if (VA.needsCustom()) { 1719 assert( 1720 VA.getValVT() == MVT::v64i1 && 1721 "Currently the only custom case is when we split v64i1 to 2 regs"); 1722 1723 // v64i1 values, in regcall calling convention, that are 1724 // compiled to 32 bit arch, are split up into two registers. 1725 ArgValue = 1726 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); 1727 } else { 1728 const TargetRegisterClass *RC; 1729 if (RegVT == MVT::i8) 1730 RC = &X86::GR8RegClass; 1731 else if (RegVT == MVT::i16) 1732 RC = &X86::GR16RegClass; 1733 else if (RegVT == MVT::i32) 1734 RC = &X86::GR32RegClass; 1735 else if (Is64Bit && RegVT == MVT::i64) 1736 RC = &X86::GR64RegClass; 1737 else if (RegVT == MVT::f16) 1738 RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass; 1739 else if (RegVT == MVT::f32) 1740 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; 1741 else if (RegVT == MVT::f64) 1742 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; 1743 else if (RegVT == MVT::f80) 1744 RC = &X86::RFP80RegClass; 1745 else if (RegVT == MVT::f128) 1746 RC = &X86::VR128RegClass; 1747 else if (RegVT.is512BitVector()) 1748 RC = &X86::VR512RegClass; 1749 else if (RegVT.is256BitVector()) 1750 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass; 1751 else if (RegVT.is128BitVector()) 1752 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass; 1753 else if (RegVT == MVT::x86mmx) 1754 RC = &X86::VR64RegClass; 1755 else if (RegVT == MVT::v1i1) 1756 RC = &X86::VK1RegClass; 1757 else if (RegVT == MVT::v8i1) 1758 RC = &X86::VK8RegClass; 1759 else if (RegVT == MVT::v16i1) 1760 RC = &X86::VK16RegClass; 1761 else if (RegVT == MVT::v32i1) 1762 RC = &X86::VK32RegClass; 1763 else if (RegVT == MVT::v64i1) 1764 RC = &X86::VK64RegClass; 1765 else 1766 llvm_unreachable("Unknown argument type!"); 1767 1768 Register Reg = MF.addLiveIn(VA.getLocReg(), RC); 1769 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1770 } 1771 1772 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1773 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1774 // right size. 1775 if (VA.getLocInfo() == CCValAssign::SExt) 1776 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1777 DAG.getValueType(VA.getValVT())); 1778 else if (VA.getLocInfo() == CCValAssign::ZExt) 1779 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1780 DAG.getValueType(VA.getValVT())); 1781 else if (VA.getLocInfo() == CCValAssign::BCvt) 1782 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); 1783 1784 if (VA.isExtInLoc()) { 1785 // Handle MMX values passed in XMM regs. 
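// (X86ISD::MOVDQ2Q corresponds to the movdq2q instruction: it moves the low
// 64 bits of the XMM register into an MMX register.)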
1786 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) 1787 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); 1788 else if (VA.getValVT().isVector() && 1789 VA.getValVT().getScalarType() == MVT::i1 && 1790 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || 1791 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { 1792 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 1793 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG); 1794 } else 1795 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1796 } 1797 } else { 1798 assert(VA.isMemLoc()); 1799 ArgValue = 1800 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex); 1801 } 1802 1803 // If value is passed via pointer - do a load. 1804 if (VA.getLocInfo() == CCValAssign::Indirect && 1805 !(Ins[I].Flags.isByVal() && VA.isRegLoc())) { 1806 ArgValue = 1807 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo()); 1808 } 1809 1810 InVals.push_back(ArgValue); 1811 } 1812 1813 for (unsigned I = 0, E = Ins.size(); I != E; ++I) { 1814 if (Ins[I].Flags.isSwiftAsync()) { 1815 auto X86FI = MF.getInfo<X86MachineFunctionInfo>(); 1816 if (Subtarget.is64Bit()) 1817 X86FI->setHasSwiftAsyncContext(true); 1818 else { 1819 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false); 1820 X86FI->setSwiftAsyncContextFrameIdx(FI); 1821 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I], 1822 DAG.getFrameIndex(FI, MVT::i32), 1823 MachinePointerInfo::getFixedStack(MF, FI)); 1824 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain); 1825 } 1826 } 1827 1828 // Swift calling convention does not require we copy the sret argument 1829 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. 1830 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail) 1831 continue; 1832 1833 // All x86 ABIs require that for returning structs by value we copy the 1834 // sret argument into %rax/%eax (depending on ABI) for the return. Save 1835 // the argument into a virtual register so that we can access it from the 1836 // return points. 1837 if (Ins[I].Flags.isSRet()) { 1838 assert(!FuncInfo->getSRetReturnReg() && 1839 "SRet return has already been set"); 1840 MVT PtrTy = getPointerTy(DAG.getDataLayout()); 1841 Register Reg = 1842 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); 1843 FuncInfo->setSRetReturnReg(Reg); 1844 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]); 1845 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1846 break; 1847 } 1848 } 1849 1850 unsigned StackSize = CCInfo.getStackSize(); 1851 // Align stack specially for tail calls. 1852 if (shouldGuaranteeTCO(CallConv, 1853 MF.getTarget().Options.GuaranteedTailCallOpt)) 1854 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1855 1856 if (IsVarArg) 1857 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo) 1858 .lowerVarArgsParameters(Chain, StackSize); 1859 1860 // Some CCs need callee pop. 1861 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg, 1862 MF.getTarget().Options.GuaranteedTailCallOpt)) { 1863 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1864 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { 1865 // X86 interrupts must pop the error code (and the alignment padding) if 1866 // present. 1867 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 
16 : 4); 1868 } else { 1869 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1870 // If this is an sret function, the return should pop the hidden pointer. 1871 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget)) 1872 FuncInfo->setBytesToPopOnReturn(4); 1873 } 1874 1875 if (!Is64Bit) { 1876 // RegSaveFrameIndex is X86-64 only. 1877 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1878 } 1879 1880 FuncInfo->setArgumentStackSize(StackSize); 1881 1882 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { 1883 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn()); 1884 if (Personality == EHPersonality::CoreCLR) { 1885 assert(Is64Bit); 1886 // TODO: Add a mechanism to frame lowering that will allow us to indicate 1887 // that we'd prefer this slot be allocated towards the bottom of the frame 1888 // (i.e. near the stack pointer after allocating the frame). Every 1889 // funclet needs a copy of this slot in its (mostly empty) frame, and the 1890 // offset from the bottom of this and each funclet's frame must be the 1891 // same, so the size of funclets' (mostly empty) frames is dictated by 1892 // how far this slot is from the bottom (since they allocate just enough 1893 // space to accommodate holding this slot at the correct offset). 1894 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false); 1895 EHInfo->PSPSymFrameIdx = PSPSymFI; 1896 } 1897 } 1898 1899 if (shouldDisableArgRegFromCSR(CallConv) || 1900 F.hasFnAttribute("no_caller_saved_registers")) { 1901 MachineRegisterInfo &MRI = MF.getRegInfo(); 1902 for (std::pair<Register, Register> Pair : MRI.liveins()) 1903 MRI.disableCalleeSavedRegister(Pair.first); 1904 } 1905 1906 return Chain; 1907 } 1908 1909 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, 1910 SDValue Arg, const SDLoc &dl, 1911 SelectionDAG &DAG, 1912 const CCValAssign &VA, 1913 ISD::ArgFlagsTy Flags, 1914 bool isByVal) const { 1915 unsigned LocMemOffset = VA.getLocMemOffset(); 1916 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 1917 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 1918 StackPtr, PtrOff); 1919 if (isByVal) 1920 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1921 1922 MaybeAlign Alignment; 1923 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() && 1924 Arg.getSimpleValueType() != MVT::f80) 1925 Alignment = MaybeAlign(4); 1926 return DAG.getStore( 1927 Chain, dl, Arg, PtrOff, 1928 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), 1929 Alignment); 1930 } 1931 1932 /// Emit a load of return address if tail call 1933 /// optimization is performed and it is required. 1934 SDValue X86TargetLowering::EmitTailCallLoadRetAddr( 1935 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, 1936 bool Is64Bit, int FPDiff, const SDLoc &dl) const { 1937 // Adjust the Return address stack slot. 1938 EVT VT = getPointerTy(DAG.getDataLayout()); 1939 OutRetAddr = getReturnAddressFrameIndex(DAG); 1940 1941 // Load the "old" Return address. 1942 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo()); 1943 return SDValue(OutRetAddr.getNode(), 1); 1944 } 1945 1946 /// Emit a store of the return address if tail call 1947 /// optimization is performed and it is required (FPDiff!=0). 
1948 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
1949 SDValue Chain, SDValue RetAddrFrIdx,
1950 EVT PtrVT, unsigned SlotSize,
1951 int FPDiff, const SDLoc &dl) {
1952 // Store the return address to the appropriate stack slot.
1953 if (!FPDiff) return Chain;
1954 // Calculate the new stack slot for the return address.
1955 int NewReturnAddrFI =
1956 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
1957 false);
1958 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
1959 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1960 MachinePointerInfo::getFixedStack(
1961 DAG.getMachineFunction(), NewReturnAddrFI));
1962 return Chain;
1963 }
1964
1965 /// Returns a vector_shuffle mask for a movs{s|d}, movd
1966 /// operation of the specified width.
1967 SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
1968 SDValue V1, SDValue V2) const {
1969 unsigned NumElems = VT.getVectorNumElements();
1970 SmallVector<int, 8> Mask;
1971 Mask.push_back(NumElems);
1972 for (unsigned i = 1; i != NumElems; ++i)
1973 Mask.push_back(i);
1974 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
1975 }
1976
1977 SDValue
1978 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1979 SmallVectorImpl<SDValue> &InVals) const {
1980 SelectionDAG &DAG = CLI.DAG;
1981 SDLoc &dl = CLI.DL;
1982 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1983 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1984 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1985 SDValue Chain = CLI.Chain;
1986 SDValue Callee = CLI.Callee;
1987 CallingConv::ID CallConv = CLI.CallConv;
1988 bool &isTailCall = CLI.IsTailCall;
1989 bool isVarArg = CLI.IsVarArg;
1990 const auto *CB = CLI.CB;
1991
1992 MachineFunction &MF = DAG.getMachineFunction();
1993 bool Is64Bit = Subtarget.is64Bit();
1994 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
1995 bool IsSibcall = false;
1996 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
1997 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
1998 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
1999 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2000 bool HasNCSR = (CB && isa<CallInst>(CB) &&
2001 CB->hasFnAttr("no_caller_saved_registers"));
2002 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
2003 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
2004 bool IsCFICall = IsIndirectCall && CLI.CFIType;
2005 const Module *M = MF.getMMI().getModule();
2006 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
2007
2008 MachineFunction::CallSiteInfo CSInfo;
2009 if (CallConv == CallingConv::X86_INTR)
2010 report_fatal_error("X86 interrupts may not be called directly");
2011
2012 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
2013 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
2014 // If we are using a GOT, disable tail calls to external symbols with
2015 // default visibility. Tail calling such a symbol requires using a GOT
2016 // relocation, which forces early binding of the symbol. This breaks code
2017 // that requires lazy function symbol resolution. Using musttail or
2018 // GuaranteedTailCallOpt will override this.
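// For example, in i386 PIC code a tail call to an external default-visibility
// function would have to go through a GOT-based relocation and bind the
// symbol eagerly, so it is lowered as a normal call instead; calls to
// local-linkage or non-default-visibility callees remain tail-callable.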
2019 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2020 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
2021 G->getGlobal()->hasDefaultVisibility()))
2022 isTailCall = false;
2023 }
2024
2025 if (isTailCall && !IsMustTail) {
2026 // Check if it's really possible to do a tail call.
2027 isTailCall = IsEligibleForTailCallOptimization(
2028 Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
2029 Ins, DAG);
2030
2031 // Sibcalls are automatically detected tailcalls which do not require
2032 // ABI changes.
2033 if (!IsGuaranteeTCO && isTailCall)
2034 IsSibcall = true;
2035
2036 if (isTailCall)
2037 ++NumTailCalls;
2038 }
2039
2040 if (IsMustTail && !isTailCall)
2041 report_fatal_error("failed to perform tail call elimination on a call "
2042 "site marked musttail");
2043
2044 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
2045 "Var args not supported with calling convention fastcc, ghc or hipe");
2046
2047 // Analyze operands of the call, assigning locations to each operand.
2048 SmallVector<CCValAssign, 16> ArgLocs;
2049 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2050
2051 // Allocate shadow area for Win64.
2052 if (IsWin64)
2053 CCInfo.AllocateStack(32, Align(8));
2054
2055 CCInfo.AnalyzeArguments(Outs, CC_X86);
2056
2057 // In the vectorcall calling convention a second pass is required for HVA
2058 // types.
2059 if (CallingConv::X86_VectorCall == CallConv) {
2060 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
2061 }
2062
2063 // Get a count of how many bytes are to be pushed on the stack.
2064 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
2065 if (IsSibcall)
2066 // This is a sibcall. The memory operands are already available in the
2067 // caller's own incoming argument area (its caller's stack), so push nothing.
2068 NumBytes = 0;
2069 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
2070 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2071
2072 int FPDiff = 0;
2073 if (isTailCall &&
2074 shouldGuaranteeTCO(CallConv,
2075 MF.getTarget().Options.GuaranteedTailCallOpt)) {
2076 // Lower arguments at fp - stackoffset + fpdiff.
2077 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2078
2079 FPDiff = NumBytesCallerPushed - NumBytes;
2080
2081 // Record the delta by which the return address stack slot has to move, but
2082 // only if the new delta is smaller (more negative) than the previous one.
2083 if (FPDiff < X86Info->getTCReturnAddrDelta())
2084 X86Info->setTCReturnAddrDelta(FPDiff);
2085 }
2086
2087 unsigned NumBytesToPush = NumBytes;
2088 unsigned NumBytesToPop = NumBytes;
2089
2090 // If we have an inalloca argument, all stack space has already been allocated
2091 // for us and is right at the top of the stack. We don't support multiple
2092 // arguments passed in memory when using inalloca.
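// Roughly, the IR shape handled here (illustrative, 32-bit Windows only) is:
//
//   %argmem = alloca inalloca <{ %struct.A }>
//   ; ... construct the argument in place in %argmem ...
//   call void @g(ptr inalloca(<{ %struct.A }>) %argmem)
//
// The argument memory is set up by the alloca before the call, so no
// additional bytes need to be pushed for it here.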
2093 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2094 NumBytesToPush = 0;
2095 if (!ArgLocs.back().isMemLoc())
2096 report_fatal_error("cannot use inalloca attribute on a register "
2097 "parameter");
2098 if (ArgLocs.back().getLocMemOffset() != 0)
2099 report_fatal_error("any parameter with the inalloca attribute must be "
2100 "the only memory argument");
2101 } else if (CLI.IsPreallocated) {
2102 assert(ArgLocs.back().isMemLoc() &&
2103 "cannot use preallocated attribute on a register "
2104 "parameter");
2105 SmallVector<size_t, 4> PreallocatedOffsets;
2106 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
2107 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
2108 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
2109 }
2110 }
2111 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
2112 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
2113 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
2114 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
2115 NumBytesToPush = 0;
2116 }
2117
2118 if (!IsSibcall && !IsMustTail)
2119 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
2120 NumBytes - NumBytesToPush, dl);
2121
2122 SDValue RetAddrFrIdx;
2123 // Load return address for tail calls.
2124 if (isTailCall && FPDiff)
2125 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2126 Is64Bit, FPDiff, dl);
2127
2128 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
2129 SmallVector<SDValue, 8> MemOpChains;
2130 SDValue StackPtr;
2131
2132 // The next loop assumes that the locations are in the same order as the
2133 // input arguments.
2134 assert(isSortedByValueNo(ArgLocs) &&
2135 "Argument Location list must be sorted before lowering");
2136
2137 // Walk the register/memloc assignments, inserting copies/loads. In the case
2138 // of tail call optimization, arguments are handled later.
2139 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2140 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
2141 ++I, ++OutIndex) {
2142 assert(OutIndex < Outs.size() && "Invalid Out index");
2143 // Skip inalloca/preallocated arguments; they have already been written.
2144 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
2145 if (Flags.isInAlloca() || Flags.isPreallocated())
2146 continue;
2147
2148 CCValAssign &VA = ArgLocs[I];
2149 EVT RegVT = VA.getLocVT();
2150 SDValue Arg = OutVals[OutIndex];
2151 bool isByVal = Flags.isByVal();
2152
2153 // Promote the value if needed.
2154 switch (VA.getLocInfo()) {
2155 default: llvm_unreachable("Unknown loc info!");
2156 case CCValAssign::Full: break;
2157 case CCValAssign::SExt:
2158 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2159 break;
2160 case CCValAssign::ZExt:
2161 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2162 break;
2163 case CCValAssign::AExt:
2164 if (Arg.getValueType().isVector() &&
2165 Arg.getValueType().getVectorElementType() == MVT::i1)
2166 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
2167 else if (RegVT.is128BitVector()) {
2168 // Special case: passing MMX values in XMM registers.
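// The bitcast/SCALAR_TO_VECTOR/getMOVL sequence below places the 64-bit MMX
// value in the low quadword of a v2i64 (the upper quadword is undef), which
// is how the value is handed over in an XMM register.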
2169 Arg = DAG.getBitcast(MVT::i64, Arg);
2170 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2171 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2172 } else
2173 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2174 break;
2175 case CCValAssign::BCvt:
2176 Arg = DAG.getBitcast(RegVT, Arg);
2177 break;
2178 case CCValAssign::Indirect: {
2179 if (isByVal) {
2180 // Memcpy the argument to a temporary stack slot to prevent
2181 // the caller from seeing any modifications the callee may make
2182 // as guaranteed by the `byval` attribute.
2183 int FrameIdx = MF.getFrameInfo().CreateStackObject(
2184 Flags.getByValSize(),
2185 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
2186 SDValue StackSlot =
2187 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
2188 Chain =
2189 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
2190 // From now on, treat this as a regular pointer.
2191 Arg = StackSlot;
2192 isByVal = false;
2193 } else {
2194 // Store the argument.
2195 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2196 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2197 Chain = DAG.getStore(
2198 Chain, dl, Arg, SpillSlot,
2199 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2200 Arg = SpillSlot;
2201 }
2202 break;
2203 }
2204 }
2205
2206 if (VA.needsCustom()) {
2207 assert(VA.getValVT() == MVT::v64i1 &&
2208 "Currently the only custom case is when we split v64i1 to 2 regs");
2209 // Split the v64i1 value into two registers.
2210 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
2211 } else if (VA.isRegLoc()) {
2212 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2213 const TargetOptions &Options = DAG.getTarget().Options;
2214 if (Options.EmitCallSiteInfo)
2215 CSInfo.emplace_back(VA.getLocReg(), I);
2216 if (isVarArg && IsWin64) {
2217 // The Win64 ABI requires an argument XMM register to be copied to the
2218 // corresponding shadow GPR if the callee is a varargs function.
2219 Register ShadowReg;
2220 switch (VA.getLocReg()) {
2221 case X86::XMM0: ShadowReg = X86::RCX; break;
2222 case X86::XMM1: ShadowReg = X86::RDX; break;
2223 case X86::XMM2: ShadowReg = X86::R8; break;
2224 case X86::XMM3: ShadowReg = X86::R9; break;
2225 }
2226 if (ShadowReg)
2227 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2228 }
2229 } else if (!IsSibcall && (!isTailCall || isByVal)) {
2230 assert(VA.isMemLoc());
2231 if (!StackPtr.getNode())
2232 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2233 getPointerTy(DAG.getDataLayout()));
2234 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2235 dl, DAG, VA, Flags, isByVal));
2236 }
2237 }
2238
2239 if (!MemOpChains.empty())
2240 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2241
2242 if (Subtarget.isPICStyleGOT()) {
2243 // ELF / PIC requires the GOT pointer to be in the EBX register before
2244 // function calls made via the PLT (except for regcall).
2245 if (!isTailCall) {
2246 // An indirect call with the RegCall calling convention may use up all the
2247 // general-purpose registers, so it is not suitable to pin EBX to the
2248 // GOT address; just let the register allocator handle it.
2249 if (CallConv != CallingConv::X86_RegCall)
2250 RegsToPass.push_back(std::make_pair(
2251 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2252 getPointerTy(DAG.getDataLayout()))));
2253 } else {
2254 // If we are tail calling and generating PIC/GOT style code, load the
2255 // address of the callee into ECX. The value in ECX is used as the target of
2256 // the tail jump. This is done to circumvent the ebx/callee-saved problem
2257 // for tail calls on PIC/GOT architectures. Normally we would just put the
2258 // address of GOT into ebx and then call target@PLT. But for tail calls
2259 // ebx would be restored (since ebx is callee saved) before jumping to the
2260 // target@PLT.
2261
2262 // Note: The actual moving to ECX is done further down.
2263 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2264 if (G && !G->getGlobal()->hasLocalLinkage() &&
2265 G->getGlobal()->hasDefaultVisibility())
2266 Callee = LowerGlobalAddress(Callee, DAG);
2267 else if (isa<ExternalSymbolSDNode>(Callee))
2268 Callee = LowerExternalSymbol(Callee, DAG);
2269 }
2270 }
2271
2272 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
2273 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
2274 // From the AMD64 ABI document:
2275 // For calls that may call functions that use varargs or stdargs
2276 // (prototype-less calls or calls to functions containing ellipsis (...) in
2277 // the declaration) %al is used as a hidden argument to specify the number
2278 // of SSE registers used. The contents of %al do not need to match exactly
2279 // the number of registers, but must be an upper bound on the number of SSE
2280 // registers used and is in the range 0 - 8 inclusive.
2281
2282 // Count the number of XMM registers allocated.
2283 static const MCPhysReg XMMArgRegs[] = {
2284 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2285 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2286 };
2287 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
2288 assert((Subtarget.hasSSE1() || !NumXMMRegs)
2289 && "SSE registers cannot be used when SSE is disabled");
2290 RegsToPass.push_back(std::make_pair(Register(X86::AL),
2291 DAG.getConstant(NumXMMRegs, dl,
2292 MVT::i8)));
2293 }
2294
2295 if (isVarArg && IsMustTail) {
2296 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
2297 for (const auto &F : Forwards) {
2298 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2299 RegsToPass.push_back(std::make_pair(F.PReg, Val));
2300 }
2301 }
2302
2303 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
2304 // don't need this because the eligibility check rejects calls that require
2305 // shuffling arguments passed in memory.
2306 if (!IsSibcall && isTailCall) {
2307 // Force all the incoming stack arguments to be loaded from the stack
2308 // before any new outgoing arguments are stored to the stack, because the
2309 // outgoing stack slots may alias the incoming argument stack slots, and
2310 // the alias isn't otherwise explicit. This is slightly more conservative
2311 // than necessary, because it means that each store effectively depends
2312 // on every argument instead of just those arguments it would clobber.
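// For example, a guaranteed tail call that forwards its incoming stack
// arguments in a different order writes outgoing slots that overlap the
// incoming ones, so every incoming slot is loaded (via the token factor
// below) before any outgoing store is emitted.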
2313 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2314 2315 SmallVector<SDValue, 8> MemOpChains2; 2316 SDValue FIN; 2317 int FI = 0; 2318 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E; 2319 ++I, ++OutsIndex) { 2320 CCValAssign &VA = ArgLocs[I]; 2321 2322 if (VA.isRegLoc()) { 2323 if (VA.needsCustom()) { 2324 assert((CallConv == CallingConv::X86_RegCall) && 2325 "Expecting custom case only in regcall calling convention"); 2326 // This means that we are in special case where one argument was 2327 // passed through two register locations - Skip the next location 2328 ++I; 2329 } 2330 2331 continue; 2332 } 2333 2334 assert(VA.isMemLoc()); 2335 SDValue Arg = OutVals[OutsIndex]; 2336 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; 2337 // Skip inalloca/preallocated arguments. They don't require any work. 2338 if (Flags.isInAlloca() || Flags.isPreallocated()) 2339 continue; 2340 // Create frame index. 2341 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2342 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2343 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); 2344 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 2345 2346 if (Flags.isByVal()) { 2347 // Copy relative to framepointer. 2348 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); 2349 if (!StackPtr.getNode()) 2350 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), 2351 getPointerTy(DAG.getDataLayout())); 2352 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 2353 StackPtr, Source); 2354 2355 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2356 ArgChain, 2357 Flags, DAG, dl)); 2358 } else { 2359 // Store relative to framepointer. 2360 MemOpChains2.push_back(DAG.getStore( 2361 ArgChain, dl, Arg, FIN, 2362 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); 2363 } 2364 } 2365 2366 if (!MemOpChains2.empty()) 2367 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); 2368 2369 // Store the return address to the appropriate stack slot. 2370 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, 2371 getPointerTy(DAG.getDataLayout()), 2372 RegInfo->getSlotSize(), FPDiff, dl); 2373 } 2374 2375 // Build a sequence of copy-to-reg nodes chained together with token chain 2376 // and glue operands which copy the outgoing args into registers. 2377 SDValue InGlue; 2378 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2379 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2380 RegsToPass[i].second, InGlue); 2381 InGlue = Chain.getValue(1); 2382 } 2383 2384 if (DAG.getTarget().getCodeModel() == CodeModel::Large) { 2385 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2386 // In the 64-bit large code model, we have to make all calls 2387 // through a register, since the call instruction's 32-bit 2388 // pc-relative offset may not be large enough to hold the whole 2389 // address. 2390 } else if (Callee->getOpcode() == ISD::GlobalAddress || 2391 Callee->getOpcode() == ISD::ExternalSymbol) { 2392 // Lower direct calls to global addresses and external symbols. Setting 2393 // ForCall to true here has the effect of removing WrapperRIP when possible 2394 // to allow direct calls to be selected without first materializing the 2395 // address into a register. 
2396 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
2397 } else if (Subtarget.isTarget64BitILP32() &&
2398 Callee.getValueType() == MVT::i32) {
2399 // Zero-extend the 32-bit Callee address to 64 bits according to the x32 ABI.
2400 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
2401 }
2402
2403 // Returns a chain & a glue for retval copy to use.
2404 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2405 SmallVector<SDValue, 8> Ops;
2406
2407 if (!IsSibcall && isTailCall && !IsMustTail) {
2408 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
2409 InGlue = Chain.getValue(1);
2410 }
2411
2412 Ops.push_back(Chain);
2413 Ops.push_back(Callee);
2414
2415 if (isTailCall)
2416 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
2417
2418 // Add argument registers to the end of the list so that they are known live
2419 // into the call.
2420 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2421 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2422 RegsToPass[i].second.getValueType()));
2423
2424 // Add a register mask operand representing the call-preserved registers.
2425 const uint32_t *Mask = [&]() {
2426 auto AdaptedCC = CallConv;
2427 // If HasNCSR is set (the no_caller_saved_registers attribute is present),
2428 // use the X86_INTR calling convention because it has the same CSR mask
2429 // (same preserved registers).
2430 if (HasNCSR)
2431 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
2432 // If no_callee_saved_registers is requested, then use GHC since it happens
2433 // to use the CSR_NoRegs_RegMask.
2434 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
2435 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
2436 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
2437 }();
2438 assert(Mask && "Missing call preserved mask for calling convention");
2439
2440 // If this is an invoke in a 32-bit function using a funclet-based
2441 // personality, assume the function clobbers all registers. If an exception
2442 // is thrown, the runtime will not restore CSRs.
2443 // FIXME: Model this more precisely so that we can register allocate across
2444 // the normal edge and spill and fill across the exceptional edge.
2445 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
2446 const Function &CallerFn = MF.getFunction();
2447 EHPersonality Pers =
2448 CallerFn.hasPersonalityFn()
2449 ? classifyEHPersonality(CallerFn.getPersonalityFn())
2450 : EHPersonality::Unknown;
2451 if (isFuncletEHPersonality(Pers))
2452 Mask = RegInfo->getNoPreservedMask();
2453 }
2454
2455 // Define a new register mask from the existing mask.
2456 uint32_t *RegMask = nullptr;
2457
2458 // In some calling conventions we need to remove the used physical registers
2459 // from the reg mask. Create a new RegMask for such calling conventions.
2460 // The RegMask for calling conventions that disable only return registers
2461 // (e.g. preserve_most) will be modified later in LowerCallResult.
2462 bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
2463 if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
2464 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2465
2466 // Allocate a new RegMask and copy Mask into it.
2467 RegMask = MF.allocateRegMask();
2468 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
2469 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
2470
2471 // Make sure all sub-registers of the argument registers are reset
2472 // in the RegMask.
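// (In a register-mask operand a set bit means the register is preserved
// across the call; clearing the bits below marks each argument register and
// its sub-registers as clobbered, removing them from the callee-saved set.)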
2473 if (ShouldDisableArgRegs) {
2474 for (auto const &RegPair : RegsToPass)
2475 for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first))
2476 RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
2477 }
2478
2479 // Create the RegMask Operand according to our updated mask.
2480 Ops.push_back(DAG.getRegisterMask(RegMask));
2481 } else {
2482 // Create the RegMask Operand according to the static mask.
2483 Ops.push_back(DAG.getRegisterMask(Mask));
2484 }
2485
2486 if (InGlue.getNode())
2487 Ops.push_back(InGlue);
2488
2489 if (isTailCall) {
2490 // We used to do:
2491 //// If this is the first return lowered for this function, add the regs
2492 //// to the liveout set for the function.
2493 // This isn't right, although it's probably harmless on x86; liveouts
2494 // should be computed from returns, not tail calls. Consider a void
2495 // function making a tail call to a function returning int.
2496 MF.getFrameInfo().setHasTailCall();
2497 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
2498
2499 if (IsCFICall)
2500 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2501
2502 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2503 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2504 return Ret;
2505 }
2506
2507 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
2508 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
2509 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
2510 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
2511 // expanded to the call, directly followed by a special marker sequence and
2512 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
2513 assert(!isTailCall &&
2514 "tail calls cannot be marked with clang.arc.attachedcall");
2515 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
2516
2517 // Add a target global address for the retainRV/claimRV runtime function
2518 // just before the call target.
2519 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
2520 auto PtrVT = getPointerTy(DAG.getDataLayout());
2521 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
2522 Ops.insert(Ops.begin() + 1, GA);
2523 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
2524 } else {
2525 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
2526 }
2527
2528 if (IsCFICall)
2529 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2530
2531 InGlue = Chain.getValue(1);
2532 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2533 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2534
2535 // Save heapallocsite metadata.
2536 if (CLI.CB)
2537 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
2538 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
2539
2540 // Create the CALLSEQ_END node.
2541 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
2542 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2543 DAG.getTarget().Options.GuaranteedTailCallOpt))
2544 NumBytesForCalleeToPop = NumBytes; // Callee pops everything.
2545 else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
2546 // If this call passes a struct-return pointer, the callee
2547 // pops that struct pointer.
2548 NumBytesForCalleeToPop = 4;
2549
2550 // Returns a glue for retval copy to use.
2551 if (!IsSibcall) {
2552 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
2553 InGlue, dl);
2554 InGlue = Chain.getValue(1);
2555 }
2556
2557 // Handle result values, copying them out of physregs into vregs that we
2558 // return.
2559 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2560 InVals, RegMask);
2561 }
2562
2563 //===----------------------------------------------------------------------===//
2564 // Fast Calling Convention (tail call) implementation
2565 //===----------------------------------------------------------------------===//
2566
2567 // Like stdcall, the callee cleans up the arguments, except that ECX is
2568 // reserved for storing the tail-called function's address. Only 2 registers
2569 // are free for argument passing (inreg). Tail call optimization is performed
2570 // provided:
2571 // * tailcallopt is enabled
2572 // * caller/callee are fastcc
2573 // On the X86_64 architecture with GOT-style position independent code, only
2574 // local (within module) calls are supported at the moment.
2575 // To keep the stack aligned according to the platform ABI, the function
2576 // GetAlignedArgumentStackSize ensures that the argument delta is always a
2577 // multiple of the stack alignment. (Dynamic linkers need this; Darwin's dyld, for example.)
2578 // If a tail-called callee has more arguments than the caller, the caller
2579 // needs to make sure that there is room to move the RETADDR to. This is
2580 // achieved by reserving an area the size of the argument delta right after the
2581 // original RETADDR, but before the saved frame pointer or the spilled registers,
2582 // e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
2583 // stack layout:
2584 // arg1
2585 // arg2
2586 // RETADDR
2587 // [ new RETADDR
2588 // move area ]
2589 // (possible EBP)
2590 // ESI
2591 // EDI
2592 // local1 ..
2593
2594 /// Align the stack size so that, together with the return-address slot, it
2595 /// meets the alignment requirement; e.g. 16n + 12 for 16-byte alignment with 4-byte slots.
2596 unsigned
2597 X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
2598 SelectionDAG &DAG) const {
2599 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
2600 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
2601 assert(StackSize % SlotSize == 0 &&
2602 "StackSize must be a multiple of SlotSize");
2603 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
2604 }
2605
2606 /// Return true if the given stack call argument is already available in the
2607 /// same (relative) position in the caller's incoming argument stack.
2608 static
2609 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2610 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2611 const X86InstrInfo *TII, const CCValAssign &VA) {
2612 unsigned Bytes = Arg.getValueSizeInBits() / 8;
2613
2614 for (;;) {
2615 // Look through nodes that don't alter the bits of the incoming value.
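// For example, an i8 value that reaches this point wrapped in an extension,
// an AssertZext, or a truncate back to its original type still denotes the
// same incoming stack slot, so those wrappers can be skipped when matching
// offsets.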
2616 unsigned Op = Arg.getOpcode(); 2617 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST || 2618 Op == ISD::AssertZext) { 2619 Arg = Arg.getOperand(0); 2620 continue; 2621 } 2622 if (Op == ISD::TRUNCATE) { 2623 const SDValue &TruncInput = Arg.getOperand(0); 2624 if (TruncInput.getOpcode() == ISD::AssertZext && 2625 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() == 2626 Arg.getValueType()) { 2627 Arg = TruncInput.getOperand(0); 2628 continue; 2629 } 2630 } 2631 break; 2632 } 2633 2634 int FI = INT_MAX; 2635 if (Arg.getOpcode() == ISD::CopyFromReg) { 2636 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2637 if (!VR.isVirtual()) 2638 return false; 2639 MachineInstr *Def = MRI->getVRegDef(VR); 2640 if (!Def) 2641 return false; 2642 if (!Flags.isByVal()) { 2643 if (!TII->isLoadFromStackSlot(*Def, FI)) 2644 return false; 2645 } else { 2646 unsigned Opcode = Def->getOpcode(); 2647 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || 2648 Opcode == X86::LEA64_32r) && 2649 Def->getOperand(1).isFI()) { 2650 FI = Def->getOperand(1).getIndex(); 2651 Bytes = Flags.getByValSize(); 2652 } else 2653 return false; 2654 } 2655 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2656 if (Flags.isByVal()) 2657 // ByVal argument is passed in as a pointer but it's now being 2658 // dereferenced. e.g. 2659 // define @foo(%struct.X* %A) { 2660 // tail call @bar(%struct.X* byval %A) 2661 // } 2662 return false; 2663 SDValue Ptr = Ld->getBasePtr(); 2664 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2665 if (!FINode) 2666 return false; 2667 FI = FINode->getIndex(); 2668 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 2669 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 2670 FI = FINode->getIndex(); 2671 Bytes = Flags.getByValSize(); 2672 } else 2673 return false; 2674 2675 assert(FI != INT_MAX); 2676 if (!MFI.isFixedObjectIndex(FI)) 2677 return false; 2678 2679 if (Offset != MFI.getObjectOffset(FI)) 2680 return false; 2681 2682 // If this is not byval, check that the argument stack object is immutable. 2683 // inalloca and argument copy elision can create mutable argument stack 2684 // objects. Byval objects can be mutated, but a byval call intends to pass the 2685 // mutated memory. 2686 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI)) 2687 return false; 2688 2689 if (VA.getLocVT().getFixedSizeInBits() > 2690 Arg.getValueSizeInBits().getFixedValue()) { 2691 // If the argument location is wider than the argument type, check that any 2692 // extension flags match. 2693 if (Flags.isZExt() != MFI.isObjectZExt(FI) || 2694 Flags.isSExt() != MFI.isObjectSExt(FI)) { 2695 return false; 2696 } 2697 } 2698 2699 return Bytes == MFI.getObjectSize(FI); 2700 } 2701 2702 /// Check whether the call is eligible for tail call optimization. Targets 2703 /// that want to do tail call optimization should implement this function. 2704 bool X86TargetLowering::IsEligibleForTailCallOptimization( 2705 SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet, 2706 bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs, 2707 const SmallVectorImpl<SDValue> &OutVals, 2708 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { 2709 if (!mayTailCallThisCC(CalleeCC)) 2710 return false; 2711 2712 // If -tailcallopt is specified, make fastcc functions tail-callable. 
2713 MachineFunction &MF = DAG.getMachineFunction(); 2714 const Function &CallerF = MF.getFunction(); 2715 2716 // If the function return type is x86_fp80 and the callee return type is not, 2717 // then the FP_EXTEND of the call result is not a nop. It's not safe to 2718 // perform a tailcall optimization here. 2719 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) 2720 return false; 2721 2722 CallingConv::ID CallerCC = CallerF.getCallingConv(); 2723 bool CCMatch = CallerCC == CalleeCC; 2724 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); 2725 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); 2726 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt || 2727 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail; 2728 2729 // Win64 functions have extra shadow space for argument homing. Don't do the 2730 // sibcall if the caller and callee have mismatched expectations for this 2731 // space. 2732 if (IsCalleeWin64 != IsCallerWin64) 2733 return false; 2734 2735 if (IsGuaranteeTCO) { 2736 if (canGuaranteeTCO(CalleeCC) && CCMatch) 2737 return true; 2738 return false; 2739 } 2740 2741 // Look for obvious safe cases to perform tail call optimization that do not 2742 // require ABI changes. This is what gcc calls sibcall. 2743 2744 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2745 // emit a special epilogue. 2746 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); 2747 if (RegInfo->hasStackRealignment(MF)) 2748 return false; 2749 2750 // Also avoid sibcall optimization if we're an sret return fn and the callee 2751 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is 2752 // insufficient. 2753 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) { 2754 // For a compatible tail call the callee must return our sret pointer. So it 2755 // needs to be (a) an sret function itself and (b) we pass our sret as its 2756 // sret. Condition #b is harder to determine. 2757 return false; 2758 } else if (IsCalleePopSRet) 2759 // The callee pops an sret, so we cannot tail-call, as our caller doesn't 2760 // expect that. 2761 return false; 2762 2763 // Do not sibcall optimize vararg calls unless all arguments are passed via 2764 // registers. 2765 LLVMContext &C = *DAG.getContext(); 2766 if (isVarArg && !Outs.empty()) { 2767 // Optimizing for varargs on Win64 is unlikely to be safe without 2768 // additional testing. 2769 if (IsCalleeWin64 || IsCallerWin64) 2770 return false; 2771 2772 SmallVector<CCValAssign, 16> ArgLocs; 2773 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 2774 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2775 for (const auto &VA : ArgLocs) 2776 if (!VA.isRegLoc()) 2777 return false; 2778 } 2779 2780 // If the call result is in ST0 / ST1, it needs to be popped off the x87 2781 // stack. Therefore, if it's not used by the call it is not safe to optimize 2782 // this into a sibcall. 2783 bool Unused = false; 2784 for (const auto &In : Ins) { 2785 if (!In.Used) { 2786 Unused = true; 2787 break; 2788 } 2789 } 2790 if (Unused) { 2791 SmallVector<CCValAssign, 16> RVLocs; 2792 CCState CCInfo(CalleeCC, false, MF, RVLocs, C); 2793 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2794 for (const auto &VA : RVLocs) { 2795 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) 2796 return false; 2797 } 2798 } 2799 2800 // Check that the call results are passed in the same way. 
2801 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, 2802 RetCC_X86, RetCC_X86)) 2803 return false; 2804 // The callee has to preserve all registers the caller needs to preserve. 2805 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); 2806 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 2807 if (!CCMatch) { 2808 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 2809 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 2810 return false; 2811 } 2812 2813 unsigned StackArgsSize = 0; 2814 2815 // If the callee takes no arguments then go on to check the results of the 2816 // call. 2817 if (!Outs.empty()) { 2818 // Check if stack adjustment is needed. For now, do not do this if any 2819 // argument is passed on the stack. 2820 SmallVector<CCValAssign, 16> ArgLocs; 2821 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 2822 2823 // Allocate shadow area for Win64 2824 if (IsCalleeWin64) 2825 CCInfo.AllocateStack(32, Align(8)); 2826 2827 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2828 StackArgsSize = CCInfo.getStackSize(); 2829 2830 if (CCInfo.getStackSize()) { 2831 // Check if the arguments are already laid out in the right way as 2832 // the caller's fixed stack objects. 2833 MachineFrameInfo &MFI = MF.getFrameInfo(); 2834 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2835 const X86InstrInfo *TII = Subtarget.getInstrInfo(); 2836 for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) { 2837 const CCValAssign &VA = ArgLocs[I]; 2838 SDValue Arg = OutVals[I]; 2839 ISD::ArgFlagsTy Flags = Outs[I].Flags; 2840 if (VA.getLocInfo() == CCValAssign::Indirect) 2841 return false; 2842 if (!VA.isRegLoc()) { 2843 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, 2844 TII, VA)) 2845 return false; 2846 } 2847 } 2848 } 2849 2850 bool PositionIndependent = isPositionIndependent(); 2851 // If the tailcall address may be in a register, then make sure it's 2852 // possible to register allocate for it. In 32-bit, the call address can 2853 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2854 // callee-saved registers are restored. These happen to be the same 2855 // registers used to pass 'inreg' arguments so watch out for those. 2856 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) && 2857 !isa<ExternalSymbolSDNode>(Callee)) || 2858 PositionIndependent)) { 2859 unsigned NumInRegs = 0; 2860 // In PIC we need an extra register to formulate the address computation 2861 // for the callee. 2862 unsigned MaxInRegs = PositionIndependent ? 2 : 3; 2863 2864 for (const auto &VA : ArgLocs) { 2865 if (!VA.isRegLoc()) 2866 continue; 2867 Register Reg = VA.getLocReg(); 2868 switch (Reg) { 2869 default: break; 2870 case X86::EAX: case X86::EDX: case X86::ECX: 2871 if (++NumInRegs == MaxInRegs) 2872 return false; 2873 break; 2874 } 2875 } 2876 } 2877 2878 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2879 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 2880 return false; 2881 } 2882 2883 bool CalleeWillPop = 2884 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, 2885 MF.getTarget().Options.GuaranteedTailCallOpt); 2886 2887 if (unsigned BytesToPop = 2888 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) { 2889 // If we have bytes to pop, the callee must pop them. 
2890 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; 2891 if (!CalleePopMatches) 2892 return false; 2893 } else if (CalleeWillPop && StackArgsSize > 0) { 2894 // If we don't have bytes to pop, make sure the callee doesn't pop any. 2895 return false; 2896 } 2897 2898 return true; 2899 } 2900 2901 /// Determines whether the callee is required to pop its own arguments. 2902 /// Callee pop is necessary to support tail calls. 2903 bool X86::isCalleePop(CallingConv::ID CallingConv, 2904 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { 2905 // If GuaranteeTCO is true, we force some calls to be callee pop so that we 2906 // can guarantee TCO. 2907 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) 2908 return true; 2909 2910 switch (CallingConv) { 2911 default: 2912 return false; 2913 case CallingConv::X86_StdCall: 2914 case CallingConv::X86_FastCall: 2915 case CallingConv::X86_ThisCall: 2916 case CallingConv::X86_VectorCall: 2917 return !is64Bit; 2918 } 2919 } 2920
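// For example, on 32-bit x86 a stdcall callee such as 'void __stdcall f(int, int)'
// returns with 'ret 8', popping its own 8 bytes of arguments, whereas in 64-bit
// mode these conventions leave argument cleanup to the caller (hence the
// !is64Bit above).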