//===- llvm/lib/Target/X86/X86ISelCallLowering.cpp - Call lowering --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the lowering of LLVM calls to DAG nodes.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86ISelLowering.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

#define DEBUG_TYPE "x86-isel"

using namespace llvm;

STATISTIC(NumTailCalls, "Number of tail calls");

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}

/// Returns true if a CC can dynamically exclude a register from the list of
/// callee-saved registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
/// the return registers.
static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
  switch (CC) {
  default:
    return false;
  case CallingConv::X86_RegCall:
  case CallingConv::PreserveMost:
  case CallingConv::PreserveAll:
    return true;
  }
}

/// Returns true if a CC can dynamically exclude a register from the list of
/// callee-saved registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
/// the parameters.
static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
  return CC == CallingConv::X86_RegCall;
}

static std::pair<MVT, unsigned>
handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
                                 const X86Subtarget &Subtarget) {
  // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
  // convention is one that uses k registers.
  if (NumElts == 2)
    return {MVT::v2i64, 1};
  if (NumElts == 4)
    return {MVT::v4i32, 1};
  if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
      CC != CallingConv::Intel_OCL_BI)
    return {MVT::v8i16, 1};
  if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
      CC != CallingConv::Intel_OCL_BI)
    return {MVT::v16i8, 1};
  // v32i1 passes in ymm unless we have BWI and the calling convention is
  // regcall.
  if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
    return {MVT::v32i8, 1};
  // Split v64i1 vectors if we don't have v64i8 available.
  if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
    if (Subtarget.useAVX512Regs())
      return {MVT::v64i8, 1};
    return {MVT::v32i8, 2};
  }

  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
      NumElts > 64)
    return {MVT::i8, NumElts};

  return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
}

MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                     CallingConv::ID CC,
                                                     EVT VT) const {
  if (VT.isVector()) {
    if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
      unsigned NumElts = VT.getVectorNumElements();

      MVT RegisterVT;
      unsigned NumRegisters;
      std::tie(RegisterVT, NumRegisters) =
          handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
      if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
        return RegisterVT;
    }

    if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
      return MVT::v8f16;
  }

  // We will use more GPRs for f64 and f80 on 32-bit targets when x87 is
  // disabled.
  if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
      !Subtarget.hasX87())
    return MVT::i32;

  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
    return getRegisterTypeForCallingConv(Context, CC,
                                         VT.changeVectorElementType(MVT::f16));

  if (VT == MVT::bf16)
    return MVT::f16;

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}

unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                          CallingConv::ID CC,
                                                          EVT VT) const {
  if (VT.isVector()) {
    if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
      unsigned NumElts = VT.getVectorNumElements();

      MVT RegisterVT;
      unsigned NumRegisters;
      std::tie(RegisterVT, NumRegisters) =
          handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
      if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
        return NumRegisters;
    }

    if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
      return 1;
  }

  // We have to split f64 into 2 registers and f80 into 3 registers on 32-bit
  // targets if x87 is disabled.
  if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
    if (VT == MVT::f64)
      return 2;
    if (VT == MVT::f80)
      return 3;
  }

  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
    return getNumRegistersForCallingConv(Context, CC,
                                         VT.changeVectorElementType(MVT::f16));

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}

unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      Subtarget.hasAVX512() &&
      (!isPowerOf2_32(VT.getVectorNumElements()) ||
       (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
       VT.getVectorNumElements() > 64)) {
    RegisterVT = MVT::i8;
    IntermediateVT = MVT::i1;
    NumIntermediates = VT.getVectorNumElements();
    return NumIntermediates;
  }

  // Split v64i1 vectors if we don't have v64i8 available.
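  // For example, with AVX512BW but a 256-bit preferred vector width (so
  // useAVX512Regs() is false), a v64i1 value is broken into two v32i1 halves,
  // each travelling in a v32i8 register.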
  if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
      CC != CallingConv::X86_RegCall) {
    RegisterVT = MVT::v32i8;
    IntermediateVT = MVT::v32i1;
    NumIntermediates = 2;
    return 2;
  }

  // Split vNbf16 vectors according to vNf16.
  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
    VT = VT.changeVectorElementType(MVT::f16);

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}

EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext &Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (Subtarget.hasAVX512()) {
    // Figure out what this type will be legalized to.
    EVT LegalVT = VT;
    while (getTypeAction(Context, LegalVT) != TypeLegal)
      LegalVT = getTypeToTransformTo(Context, LegalVT);

    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
    if (LegalVT.getSimpleVT().is512BitVector())
      return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());

    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
      // If we legalized to less than a 512-bit vector, then we will use a vXi1
      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
      // vXi16/vXi8.
      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
        return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
    }
  }

  return VT.changeVectorElementTypeToInteger();
}

/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
      MaxAlign = Align(16);
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    Align EltAlign;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (auto *EltTy : STy->elements()) {
      Align EltAlign;
      getMaxByValAlign(EltTy, EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
}

/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  if (Subtarget.is64Bit()) {
    // Max of 8 and alignment of type.
    Align TyAlign = DL.getABITypeAlign(Ty);
    if (TyAlign > 8)
      return TyAlign.value();
    return 8;
  }

  Align Alignment(4);
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, Alignment);
  return Alignment.value();
}

/// Returns EVT::Other if the type should be determined using generic
/// target-independent logic.
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width.
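/// For example (illustrative): a 64-byte zero memset on an AVX512BW target
/// whose preferred vector width is 512 bits is lowered with v64i8 stores,
/// while the same operation in a NoImplicitFloat function falls back to the
/// i64/i32 GPR path at the bottom of this function.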
EVT X86TargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Op.size() >= 16 &&
        (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
      // FIXME: Check if unaligned 64-byte accesses are slow.
      if (Op.size() >= 64 && Subtarget.hasAVX512() && Subtarget.hasEVEX512() &&
          (Subtarget.getPreferVectorWidth() >= 512)) {
        return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
      }
      // FIXME: Check if unaligned 32-byte accesses are slow.
      if (Op.size() >= 32 && Subtarget.hasAVX() &&
          Subtarget.useLight256BitInstructions()) {
        // Although this isn't a well-supported type for AVX1, we'll let
        // legalization and shuffle lowering produce the optimal codegen. If we
        // choose an optimal type with a vector element larger than a byte,
        // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      // If we have SSE1 registers we should be able to use them.
      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
          (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v4f32;
    } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
               Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Op.size() >= 8)
    return MVT::i64;
  return MVT::i32;
}

bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return Subtarget.hasSSE1();
  if (VT == MVT::f64)
    return Subtarget.hasSSE2();
  return true;
}

static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
  return (8 * Alignment.value()) % SizeInBits == 0;
}

bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
  if (isBitAligned(Alignment, VT.getSizeInBits()))
    return true;
  switch (VT.getSizeInBits()) {
  default:
    // 8-byte and under are always assumed to be fast.
    return true;
  case 128:
    return !Subtarget.isUnalignedMem16Slow();
  case 256:
    return !Subtarget.isUnalignedMem32Slow();
    // TODO: What about AVX-512 (512-bit) accesses?
  }
}

bool X86TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
    unsigned *Fast) const {
  if (Fast)
    *Fast = isMemoryAccessFast(VT, Alignment);
  // NonTemporal vector memory ops must be aligned.
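  // e.g. a sub-16-byte-aligned NT vector load is reported as allowed here (it
  // will simply be emitted as a regular unaligned load), while misaligned NT
  // stores are always rejected so they get split into aligned pieces.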
  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    // NT loads can only be vector aligned, so if it's less aligned than the
    // minimum vector size (which we can split the vector down to), we might as
    // well use a regular unaligned vector load.
    // We don't have any NT loads pre-SSE41.
    if (!!(Flags & MachineMemOperand::MOLoad))
      return (Alignment < 16 || !Subtarget.hasSSE41());
    return false;
  }
  // Misaligned accesses of any size are always allowed.
  return true;
}

bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
                                           const DataLayout &DL, EVT VT,
                                           unsigned AddrSpace, Align Alignment,
                                           MachineMemOperand::Flags Flags,
                                           unsigned *Fast) const {
  if (Fast)
    *Fast = isMemoryAccessFast(VT, Alignment);
  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
                                       /*Fast=*/nullptr))
      return true;
    // NonTemporal vector memory ops are special, and must be aligned.
    if (!isBitAligned(Alignment, VT.getSizeInBits()))
      return false;
    switch (VT.getSizeInBits()) {
    case 128:
      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
        return true;
      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
        return true;
      return false;
    case 256:
      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
        return true;
      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
        return true;
      return false;
    case 512:
      if (Subtarget.hasAVX512() && Subtarget.hasEVEX512())
        return true;
      return false;
    default:
      return false; // Don't have NonTemporal vector memory ops of this size.
    }
  }
  return true;
}

/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;
  if (isPositionIndependent() &&
      getTargetMachine().getCodeModel() == CodeModel::Large &&
      !Subtarget.isTargetCOFF())
    return MachineJumpTableInfo::EK_LabelDifference64;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

bool X86TargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}

void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {

  // Only relabel X86-32 for C / Stdcall CCs.
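  // Roughly: with -mregparm=3 (module flag NumRegisterParameters == 3), the
  // first three i32-sized libcall arguments get marked inreg below; a 64-bit
  // integer argument consumes two of the available register slots.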
  if (Subtarget.is64Bit())
    return;
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;
  unsigned ParamRegs = 0;
  if (auto *M = MF->getFunction().getParent())
    ParamRegs = M->getNumberRegisterParameters();

  // Mark the first N integer arguments as being passed in registers.
  for (auto &Arg : Args) {
    Type *T = Arg.Ty;
    if (T->isIntOrPtrTy())
      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
        unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
          numRegs = 2;
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Arg.IsInReg = true;
      }
  }
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid,
                                             MCContext &Ctx) const {
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget.is64Bit())
    // This doesn't have SDLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                       getPointerTy(DAG.getDataLayout()));
  return Table;
}

/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget.isPICStyleRIPRel() ||
      (Subtarget.is64Bit() &&
       getTargetMachine().getCodeModel() == CodeModel::Large))
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}

std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
  case MVT::v8f32: case MVT::v4f64:
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
  case MVT::v16f32: case MVT::v8f64:
    RRC = &X86::VR128XRegClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

unsigned X86TargetLowering::getAddressSpace() const {
  if (Subtarget.is64Bit())
    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
  return 256;
}

static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}

static Constant *SegmentOffset(IRBuilderBase &IRB, int Offset,
                               unsigned AddressSpace) {
  return ConstantExpr::getIntToPtr(
      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
      IRB.getPtrTy(AddressSpace));
}

Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
  // tcbhead_t; use it instead of the usual global variable (see
  // sysdeps/{i386,x86_64}/nptl/tls.h)
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
    unsigned AddressSpace = getAddressSpace();

    // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
    if (Subtarget.isTargetFuchsia())
      return SegmentOffset(IRB, 0x10, AddressSpace);

    Module *M = IRB.GetInsertBlock()->getParent()->getParent();
    // Some users may customize the base register and offset.
    int Offset = M->getStackProtectorGuardOffset();
    // If -stack-protector-guard-offset is not set, the guard lives at
    // %fs:0x28, unless we're using a Kernel code model, in which case it's
    // %gs:0x28. On i386 it's %gs:0x14.
    if (Offset == INT_MAX)
      Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;

    StringRef GuardReg = M->getStackProtectorGuardReg();
    if (GuardReg == "fs")
      AddressSpace = X86AS::FS;
    else if (GuardReg == "gs")
      AddressSpace = X86AS::GS;

    // Use the guard symbol if the user specified one.
    StringRef GuardSymb = M->getStackProtectorGuardSymbol();
    if (!GuardSymb.empty()) {
      GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
      if (!GV) {
        Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
                                       : Type::getInt32Ty(M->getContext());
        GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
                                nullptr, GuardSymb, nullptr,
                                GlobalValue::NotThreadLocal, AddressSpace);
        if (!Subtarget.isTargetDarwin())
          GV->setDSOLocal(M->getDirectAccessExternalData());
      }
      return GV;
    }

    return SegmentOffset(IRB, Offset, AddressSpace);
  }
  return TargetLowering::getIRStackGuard(IRB);
}

void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // The MSVC CRT provides functionality for stack protection.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    // The MSVC CRT has a global variable holding the security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        PointerType::getUnqual(M.getContext()));

    // The MSVC CRT has a function to validate the security cookie.
    FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
        "__security_check_cookie", Type::getVoidTy(M.getContext()),
        PointerType::getUnqual(M.getContext()));
    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
      F->setCallingConv(CallingConv::X86_FastCall);
      F->addParamAttr(0, Attribute::AttrKind::InReg);
    }
    return;
  }

  StringRef GuardMode = M.getStackProtectorGuard();

  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
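  // In that case the guard is read straight from TLS (e.g. %fs:0x28 on x86_64
  // glibc, see getIRStackGuard above), so no __stack_chk_guard global needs to
  // be declared here.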
  if ((GuardMode == "tls" || GuardMode.empty()) &&
      hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M);
}

Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  // The MSVC CRT has a global variable holding the security cookie.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    return M.getGlobalVariable("__security_cookie");
  }
  return TargetLowering::getSDagStackGuard(M);
}

Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // The MSVC CRT has a function to validate the security cookie.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    return M.getFunction("__security_check_cookie");
  }
  return TargetLowering::getSSPStackGuardCheck(M);
}

Value *
X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case it's
    // %gs:0x48. On i386 it's %gs:0x24.
    int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }

  // Fuchsia is similar.
  if (Subtarget.isTargetFuchsia()) {
    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
    return SegmentOffset(IRB, 0x18, getAddressSpace());
  }

  return TargetLowering::getSafeStackPointerLocation(IRB);
}

//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

bool X86TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
  return ScratchRegs;
}

ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
  static const MCPhysReg RCRegs[] = {X86::FPCW, X86::MXCSR};
  return RCRegs;
}

/// Lowers mask values (v*i1) to the local register values.
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &DL, SelectionDAG &DAG) {
  EVT ValVT = ValArg.getValueType();

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ValLoc, ValArg,
                       DAG.getIntPtrConstant(0, DL));

  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two stage lowering might be required
    // bitcast:   v8i1 -> i8  / v16i1 -> i16
    // anyextend: i8   -> i32 / i16   -> i32
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValToCopy);
    return ValToCopy;
  }

  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
      (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One stage lowering is required
    // bitcast: v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(ValLoc, ValArg);
  }

  return DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValArg);
}

/// Breaks v64i1 value into two registers and adds the new node to the DAG.
static void Passv64i1ArgInRegs(
    const SDLoc &DL, SelectionDAG &DAG, SDValue &Arg,
    SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The value should reside in two registers");

  // Before splitting the value we cast it to i64.
  Arg = DAG.getBitcast(MVT::i64, Arg);

  // Split the value into two i32 halves.
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitScalar(Arg, DL, MVT::i32, MVT::i32);

  // Attach the two i32 halves to the corresponding registers.
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // In some cases we need to disable registers from the default CSR list.
  // For example, when they are used as return registers (preserve_* and X86's
  // regcall) or for argument passing (X86's regcall).
  bool ShouldDisableCalleeSavedRegister =
      shouldDisableRetRegFromCSR(CallConv) ||
      MF.getFunction().hasFnAttribute("no_caller_saved_registers");

  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
    report_fatal_error("X86 interrupts may not return any value");

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  SmallVector<std::pair<Register, SDValue>, 4> RetVals;
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++OutsIndex) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");

    // Add the register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

    SDValue ValToCopy = OutVals[OutsIndex];
    EVT ValVT = ValToCopy.getValueType();

    // Promote values to the appropriate types.
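    // For example, an i8/i16 return widened to a 32-bit location is a plain
    // SExt/ZExt/AExt promotion, while an AVX512 mask value such as v16i1
    // assigned to a GPR goes through lowerMasksToReg above (bitcast to i16,
    // then any-extend to the location type).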
    if (VA.getLocInfo() == CCValAssign::SExt)
      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::ZExt)
      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::AExt) {
      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
      else
        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
    } else if (VA.getLocInfo() == CCValAssign::BCvt)
      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

    assert(VA.getLocInfo() != CCValAssign::FPExt &&
           "Unexpected FP-extend for return value.");

    // Report an error if we have attempted to return a value via an XMM
    // register and SSE was disabled.
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(VA.getLocReg()) &&
               ValVT == MVT::f64) {
      // When returning a double via an XMM register, report an error if SSE2
      // is not enabled.
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget.is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget.hasSSE2())
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
        }
      }
    }

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");

      Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
                         Subtarget);

      // Add the second register to the CalleeSaveDisableRegs list.
      if (ShouldDisableCalleeSavedRegister)
        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
    } else {
      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
    }
  }

  SDValue Glue;
  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
                                         MVT::i32));

  // Copy the result values into the output registers.
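  // Values headed for FP0/FP1 were collected above without a CopyToReg; they
  // are appended to the RET node directly so the FP stackifier can place them.
  // Everything else becomes a glued CopyToReg feeding the return.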
  for (auto &RetVal : RetVals) {
    if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
      RetOps.push_back(RetVal.second);
      continue; // Don't emit a copytoreg.
    }

    Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
    Glue = Chain.getValue(1);
    RetOps.push_back(
        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
  }

  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  //
  // Checking Function.hasStructRetAttr() here is insufficient because the IR
  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
  // false, then an sret argument may be implicitly inserted in the SelDAG. In
  // either case FuncInfo->setSRetReturnReg() will have been called.
  if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
    // When we have both sret and another return value, we should use the
    // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals to Chain.

    // For the case of sret and another return value, we have
    //   Chain_0 at the function entry
    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
    // If we use Chain_1 in getCopyFromReg, we will have
    //   Val = getCopyFromReg(Chain_1)
    //   Chain_2 = getCopyToReg(Chain_1, Val) from below

    // getCopyToReg(Chain_0) will be glued together with
    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
    //   Data dependency from Unit B to Unit A due to usage of Val in
    //     getCopyToReg(Chain_1, Val)
    //   Chain dependency from Unit A to Unit B
    //
    // So here, we use RetOps[0] (i.e. Chain_0) for getCopyFromReg.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    Register RetValReg =
        (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
        X86::RAX : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
    Glue = Chain.getValue(1);

    // RAX/EAX now acts like a return value.
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

    // Add the returned register to the CalleeSaveDisableRegs list. Don't do
    // this however for preserve_most/preserve_all to minimize the number of
    // callee-saved registers for these CCs.
    if (ShouldDisableCalleeSavedRegister &&
        CallConv != CallingConv::PreserveAll &&
        CallConv != CallingConv::PreserveMost)
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
  }

  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (X86::GR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the glue if we have it.
  if (Glue.getNode())
    RetOps.push_back(Glue);

  X86ISD::NodeType opcode = X86ISD::RET_GLUE;
  if (CallConv == CallingConv::X86_INTR)
    opcode = X86ISD::IRET;
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}

bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe
    // to perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (const SDNode *U : Copy->uses()) {
    if (U->getOpcode() != X86ISD::RET_GLUE)
      return false;
    // If we are returning more than one value, we can definitely
    // not make a tail call; see PR19530.
    if (U->getNumOperands() > 4)
      return false;
    if (U->getNumOperands() == 4 &&
        U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                           ISD::NodeType ExtendKind) const {
  MVT ReturnMVT = MVT::i32;

  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
    // The ABI does not require i1, i8 or i16 to be extended.
    //
    // On Darwin, there is code in the wild relying on Clang's old behaviour of
    // always extending i8/i16 return values, so keep doing that for now.
    // (PR26665).
    ReturnMVT = MVT::i8;
  }

  EVT MinVT = getRegisterType(Context, ReturnMVT);
  return VT.bitsLT(MinVT) ? MinVT : VT;
}

/// Reads two 32 bit registers and creates a 64 bit mask value.
/// \param VA The current 32 bit value that needs to be assigned.
/// \param NextVA The next 32 bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InGlue Represents the SDValue in the parent DAG node for
///                        glue purposes. In case the DAG is already using
///                        physical registers instead of virtual ones, we
///                        should glue our new SDValue to the InGlue SDValue.
/// \return a new 64-bit SDValue.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
                                SDValue &Root, SelectionDAG &DAG,
                                const SDLoc &DL, const X86Subtarget &Subtarget,
                                SDValue *InGlue = nullptr) {
  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(VA.getValVT() == MVT::v64i1 &&
         "Expecting first location of 64 bit width type");
  assert(NextVA.getValVT() == VA.getValVT() &&
         "The locations should have the same type");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The values should reside in two registers");

  SDValue Lo, Hi;
  SDValue ArgValueLo, ArgValueHi;

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetRegisterClass *RC = &X86::GR32RegClass;

  // Read a 32 bit value from the registers.
  if (nullptr == InGlue) {
    // When no physical register is present,
    // create an intermediate virtual register.
    Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
    ArgValueLo = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValueHi = DAG.getCopyFromReg(Root, DL, Reg, MVT::i32);
  } else {
    // When a physical register is available read the value from it and glue
    // the reads together.
    ArgValueLo =
        DAG.getCopyFromReg(Root, DL, VA.getLocReg(), MVT::i32, *InGlue);
    *InGlue = ArgValueLo.getValue(2);
    ArgValueHi =
        DAG.getCopyFromReg(Root, DL, NextVA.getLocReg(), MVT::i32, *InGlue);
    *InGlue = ArgValueHi.getValue(2);
  }

  // Convert the lower i32 into a v32i1.
  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);

  // Convert the upper i32 into a v32i1.
  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);

  // Concatenate the two values together.
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v64i1, Lo, Hi);
}

/// The function will lower a register of various sizes (8/16/32/64)
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
/// \returns a DAG node containing the operand after lowering to mask type.
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
                               const EVT &ValLoc, const SDLoc &DL,
                               SelectionDAG &DAG) {
  SDValue ValReturned = ValArg;

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, ValReturned);

  if (ValVT == MVT::v64i1) {
    // On 32-bit targets this case is handled by getv64i1Argument.
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
    // On 64-bit targets there is no need to truncate the value, only bitcast.
  } else {
    MVT MaskLenVT;
    switch (ValVT.getSimpleVT().SimpleTy) {
    case MVT::v8i1:
      MaskLenVT = MVT::i8;
      break;
    case MVT::v16i1:
      MaskLenVT = MVT::i16;
      break;
    case MVT::v32i1:
      MaskLenVT = MVT::i32;
      break;
    default:
      llvm_unreachable("Expecting a vector of i1 types");
    }

    ValReturned = DAG.getNode(ISD::TRUNCATE, DL, MaskLenVT, ValReturned);
  }
  return DAG.getBitcast(ValVT, ValReturned);
}

/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue X86TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    uint32_t *RegMask) const {

  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++InsIndex) {
    CCValAssign &VA = RVLocs[I];
    EVT CopyVT = VA.getLocVT();

    // In some calling conventions we need to remove the used registers
    // from the register mask.
    if (RegMask) {
      for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg()))
        RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
    }

    // Report an error if there was an attempt to return FP values via XMM
    // registers.
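    // As in LowerReturn, the location is retargeted to FP0/FP1 below purely
    // so later code does not assert; the diagnostic itself has already been
    // emitted.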
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(VA.getLocReg()) &&
               CopyVT == MVT::f64) {
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    bool RoundAfterCopy = false;
    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      if (!Subtarget.hasX87())
        report_fatal_error("X87 register return with X87 disabled");
      CopyVT = MVT::f80;
      RoundAfterCopy = (CopyVT != VA.getLocVT());
    }

    SDValue Val;
    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");
      Val =
          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
                  .getValue(1);
      Val = Chain.getValue(0);
      InGlue = Chain.getValue(2);
    }

    if (RoundAfterCopy)
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));

    if (VA.isExtInLoc()) {
      if (VA.getValVT().isVector() &&
          VA.getValVT().getScalarType() == MVT::i1 &&
          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8.
        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
      } else
        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
    }

    if (VA.getLocInfo() == CCValAssign::BCvt)
      Val = DAG.getBitcast(VA.getValVT(), Val);

    InVals.push_back(Val);
  }

  return Chain;
}

//===----------------------------------------------------------------------===//
// C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  The StdCall calling convention is standard for many Windows API routines.
//  It differs from the C calling convention only slightly: the callee cleans
//  up the stack instead of the caller, and symbols are decorated in a
//  particular way. It doesn't support any vector arguments.
//  For info on the fast calling convention see Fast Calling Convention
//  (tail call) implementation LowerX86_32FastCCCallTo.
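//
//  On 32-bit targets a callee returning a struct through a hidden sret pointer
//  also pops that 4-byte pointer (e.g. "ret $4"), except under the MSVC ABI
//  and on MCU targets; hasCalleePopSRet below encodes exactly that rule.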

/// Determines whether Args, either a set of outgoing arguments to a call, or a
/// set of incoming args of a call, contains an sret pointer that the callee
/// pops.
template <typename T>
static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
                             const X86Subtarget &Subtarget) {
  // Not C++20 (yet), so no concepts available.
  static_assert(std::is_same_v<T, ISD::OutputArg> ||
                    std::is_same_v<T, ISD::InputArg>,
                "requires ISD::OutputArg or ISD::InputArg");

  // Only 32-bit pops the sret. It's a 64-bit world these days, so early-out
  // for most compilations.
  if (!Subtarget.is32Bit())
    return false;

  if (Args.empty())
    return false;

  // Most calls do not have an sret argument, check the arg next.
  const ISD::ArgFlagsTy &Flags = Args[0].Flags;
  if (!Flags.isSRet() || Flags.isInReg())
    return false;

  // The MSVC ABI does not pop the sret.
  if (Subtarget.getTargetTriple().isOSMSVCRT())
    return false;

  // MCUs don't pop the sret either.
  if (Subtarget.isTargetMCU())
    return false;

  // Otherwise the callee pops the sret argument.
  return true;
}

/// Make a copy of an aggregate at address specified by "Src" to address
/// "Dst" with size and alignment information specified by the specific
/// parameter attribute. The copy will be passed as a byval function parameter.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);

  return DAG.getMemcpy(
      Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
      /*isVolatile*/ false, /*AlwaysInline=*/true,
      /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), MachinePointerInfo());
}

/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
          CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
          CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  // C calling conventions:
  case CallingConv::C:
  case CallingConv::Win64:
  case CallingConv::X86_64_SysV:
  case CallingConv::PreserveNone:
  // Callee pop conventions:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_StdCall:
  case CallingConv::X86_VectorCall:
  case CallingConv::X86_FastCall:
  // Swift:
  case CallingConv::Swift:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}

/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
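/// For example, fastcc under -tailcallopt, or the tailcc/swifttailcc
/// conventions unconditionally.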
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
  return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
         CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
}

bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  if (!CI->isTailCall())
    return false;

  CallingConv::ID CalleeCC = CI->getCallingConv();
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  return true;
}

SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    const SDLoc &dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo &MFI, unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = shouldGuaranteeTCO(
      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;
  MVT PtrVT = getPointerTy(DAG.getDataLayout());

  // If the value is passed by pointer, we have the address passed instead of
  // the value itself. No need to extend if the mask value and location share
  // the same absolute size.
  bool ExtendedInMem =
      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can
  // be changed with more analysis. In case of tail call optimization, mark
  // all arguments mutable, since they could be overwritten by the lowering of
  // arguments in case of a tail call.
  if (Flags.isByVal()) {
    unsigned Bytes = Flags.getByValSize();
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.

    // FIXME: For now, all byval parameter objects are marked as aliasing. This
    // can be improved with deeper analysis.
    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
                                   /*isAliased=*/true);
    return DAG.getFrameIndex(FI, PtrVT);
  }

  EVT ArgVT = Ins[i].ArgVT;

  // If this is a vector that has been split into multiple parts, don't elide
  // the copy. The layout on the stack may not match the packed in-memory
  // layout.
  bool ScalarizedVector = ArgVT.isVector() && !VA.getLocVT().isVector();

  // This is an argument in memory. We might be able to perform copy elision.
  // If the argument is passed directly in memory without any extension, then
  // we can perform copy elision. Large vector types, for example, may be
  // passed indirectly by pointer.
  if (Flags.isCopyElisionCandidate() &&
      VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
      !ScalarizedVector) {
    SDValue PartAddr;
    if (Ins[i].PartOffset == 0) {
      // If this is a one-part value or the first part of a multi-part value,
      // create a stack object for the entire argument value type and return a
      // load from our portion of it. This assumes that if the first part of an
      // argument is in memory, the rest will also be in memory.
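      // For example (roughly), an i64 split into two i32 pieces on x86-32:
      // the piece at PartOffset 0 creates one fixed object covering all 8
      // bytes, and the second piece is found by the scan further below and
      // loaded at offset 4 into the same object.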
      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(),
                                     VA.getLocMemOffset(),
                                     /*IsImmutable=*/false);
      PartAddr = DAG.getFrameIndex(FI, PtrVT);
      return DAG.getLoad(
          ValVT, dl, Chain, PartAddr,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
    }

    // This is not the first piece of an argument in memory. See if there is
    // already a fixed stack object including this offset. If so, assume it
    // was created by the PartOffset == 0 branch above and create a load from
    // the appropriate offset into it.
    int64_t PartBegin = VA.getLocMemOffset();
    int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
    int FI = MFI.getObjectIndexBegin();
    for (; MFI.isFixedObjectIndex(FI); ++FI) {
      int64_t ObjBegin = MFI.getObjectOffset(FI);
      int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
      if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
        break;
    }
    if (MFI.isFixedObjectIndex(FI)) {
      SDValue Addr =
          DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
                      DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
      return DAG.getLoad(ValVT, dl, Chain, Addr,
                         MachinePointerInfo::getFixedStack(
                             DAG.getMachineFunction(), FI, Ins[i].PartOffset));
    }
  }

  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
                                 VA.getLocMemOffset(), isImmutable);

  // Set SExt or ZExt flag.
  if (VA.getLocInfo() == CCValAssign::ZExt) {
    MFI.setObjectZExt(FI, true);
  } else if (VA.getLocInfo() == CCValAssign::SExt) {
    MFI.setObjectSExt(FI, true);
  }

  MaybeAlign Alignment;
  if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
      ValVT != MVT::f80)
    Alignment = MaybeAlign(4);
  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  SDValue Val = DAG.getLoad(
      ValVT, dl, Chain, FIN,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
      Alignment);
  return ExtendedInMem
             ? (VA.getValVT().isVector()
                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
             : Val;
}

// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
                                                const X86Subtarget &Subtarget) {
  assert(Subtarget.is64Bit());

  if (Subtarget.isCallingConvWin64(CallConv)) {
    static const MCPhysReg GPR64ArgRegsWin64[] = {
      X86::RCX, X86::RDX, X86::R8, X86::R9
    };
    return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
  }

  static const MCPhysReg GPR64ArgRegs64Bit[] = {
    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
  };
  return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
}

// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
                                                CallingConv::ID CallConv,
                                                const X86Subtarget &Subtarget) {
  assert(Subtarget.is64Bit());
  if (Subtarget.isCallingConvWin64(CallConv)) {
    // The XMM registers which might contain var arg parameters are shadowed
    // by their paired GPRs, so we only need to save the GPRs to their home
    // slots.
    // TODO: __vectorcall will change this.
    return std::nullopt;
  }

  bool isSoftFloat = Subtarget.useSoftFloat();
  if (isSoftFloat || !Subtarget.hasSSE1())
    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
    // registers.
    return std::nullopt;

  static const MCPhysReg XMMArgRegs64Bit[] = {
    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
  };
  return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}

#ifndef NDEBUG
static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
  return llvm::is_sorted(
      ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
        return A.getValNo() < B.getValNo();
      });
}
#endif

namespace {
/// This is a helper class for lowering variable argument parameters.
class VarArgsLoweringHelper {
public:
  VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
                        SelectionDAG &DAG, const X86Subtarget &Subtarget,
                        CallingConv::ID CallConv, CCState &CCInfo)
      : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
        TheMachineFunction(DAG.getMachineFunction()),
        TheFunction(TheMachineFunction.getFunction()),
        FrameInfo(TheMachineFunction.getFrameInfo()),
        FrameLowering(*Subtarget.getFrameLowering()),
        TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
        CCInfo(CCInfo) {}

  // Lower variable argument parameters.
  void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);

private:
  void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);

  void forwardMustTailParameters(SDValue &Chain);

  bool is64Bit() const { return Subtarget.is64Bit(); }
  bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }

  X86MachineFunctionInfo *FuncInfo;
  const SDLoc &DL;
  SelectionDAG &DAG;
  const X86Subtarget &Subtarget;
  MachineFunction &TheMachineFunction;
  const Function &TheFunction;
  MachineFrameInfo &FrameInfo;
  const TargetFrameLowering &FrameLowering;
  const TargetLowering &TargLowering;
  CallingConv::ID CallConv;
  CCState &CCInfo;
};
} // namespace

void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
    SDValue &Chain, unsigned StackSize) {
  // If the function takes a variable number of arguments, make a frame index
  // for the start of the first vararg value... for expansion of llvm.va_start.
  // We can skip this if there are no va_start calls.
  if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
                    CallConv != CallingConv::X86_ThisCall)) {
    FuncInfo->setVarArgsFrameIndex(
        FrameInfo.CreateFixedObject(1, StackSize, true));
  }

  // 64-bit calling conventions support varargs and register parameters, so we
  // have to do extra work to spill them in the prologue.
  if (is64Bit()) {
    // Find the first unallocated argument registers.
    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
    ArrayRef<MCPhysReg> ArgXMMs =
        get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
    unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);

    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
           "SSE register cannot be used when SSE is disabled!");

    if (isWin64()) {
      // Get to the caller-allocated home save location. Add 8 to account
      // for the return address.
1530 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
1531 FuncInfo->setRegSaveFrameIndex(
1532 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
1533 // Fixup to set vararg frame on shadow area (4 x i64).
1534 if (NumIntRegs < 4)
1535 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
1536 } else {
1537 // For X86-64, if there are vararg parameters that are passed via
1538 // registers, then we must store them to their spots on the stack so
1539 // they may be loaded by dereferencing the result of va_next.
1540 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
1541 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
1542 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
1543 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
1544 }
1545
1546 SmallVector<SDValue, 6>
1547 LiveGPRs; // SDValues for the GPR argument registers holding live-in values
1548 SmallVector<SDValue, 8> LiveXMMRegs; // SDValues for the XMM argument
1549 // registers holding live-in values
1550 SDValue ALVal; // SDValue for the %al register, if it is needed
1551
1552 // Gather all the live-in physical registers.
1553 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
1554 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
1555 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
1556 }
1557 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
1558 if (!AvailableXmms.empty()) {
1559 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
1560 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
1561 for (MCPhysReg Reg : AvailableXmms) {
1562 // The fast register allocator spills virtual registers at basic block
1563 // boundaries, which would lead to uses of XMM registers outside of the
1564 // check for %al. Pass physical registers to VASTART_SAVE_XMM_REGS to
1565 // avoid unnecessary spilling.
1566 TheMachineFunction.getRegInfo().addLiveIn(Reg);
1567 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
1568 }
1569 }
1570
1571 // Store the integer parameter registers.
1572 SmallVector<SDValue, 8> MemOps;
1573 SDValue RSFIN =
1574 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
1575 TargLowering.getPointerTy(DAG.getDataLayout()));
1576 unsigned Offset = FuncInfo->getVarArgsGPOffset();
1577 for (SDValue Val : LiveGPRs) {
1578 SDValue FIN = DAG.getNode(ISD::ADD, DL,
1579 TargLowering.getPointerTy(DAG.getDataLayout()),
1580 RSFIN, DAG.getIntPtrConstant(Offset, DL));
1581 SDValue Store =
1582 DAG.getStore(Val.getValue(1), DL, Val, FIN,
1583 MachinePointerInfo::getFixedStack(
1584 DAG.getMachineFunction(),
1585 FuncInfo->getRegSaveFrameIndex(), Offset));
1586 MemOps.push_back(Store);
1587 Offset += 8;
1588 }
1589
1590 // Now store the XMM (fp + vector) parameter registers.
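// For illustration (SysV AMD64, i.e. the non-Win64 path above): the 176-byte
// register save area created above follows the va_arg layout
//
//   bytes   0..47    RDI, RSI, RDX, RCX, R8, R9   (8 bytes each)
//   bytes  48..175   XMM0 .. XMM7                 (16 bytes each)
//
// which is why the GPR stores above advance Offset in steps of 8 and why
// VarArgsFPOffset starts at ArgGPRs.size() * 8.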
1591 if (!LiveXMMRegs.empty()) { 1592 SmallVector<SDValue, 12> SaveXMMOps; 1593 SaveXMMOps.push_back(Chain); 1594 SaveXMMOps.push_back(ALVal); 1595 SaveXMMOps.push_back(RSFIN); 1596 SaveXMMOps.push_back( 1597 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32)); 1598 llvm::append_range(SaveXMMOps, LiveXMMRegs); 1599 MachineMemOperand *StoreMMO = 1600 DAG.getMachineFunction().getMachineMemOperand( 1601 MachinePointerInfo::getFixedStack( 1602 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(), 1603 Offset), 1604 MachineMemOperand::MOStore, 128, Align(16)); 1605 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS, 1606 DL, DAG.getVTList(MVT::Other), 1607 SaveXMMOps, MVT::i8, StoreMMO)); 1608 } 1609 1610 if (!MemOps.empty()) 1611 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 1612 } 1613 } 1614 1615 void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) { 1616 // Find the largest legal vector type. 1617 MVT VecVT = MVT::Other; 1618 // FIXME: Only some x86_32 calling conventions support AVX512. 1619 if (Subtarget.useAVX512Regs() && 1620 (is64Bit() || (CallConv == CallingConv::X86_VectorCall || 1621 CallConv == CallingConv::Intel_OCL_BI))) 1622 VecVT = MVT::v16f32; 1623 else if (Subtarget.hasAVX()) 1624 VecVT = MVT::v8f32; 1625 else if (Subtarget.hasSSE2()) 1626 VecVT = MVT::v4f32; 1627 1628 // We forward some GPRs and some vector types. 1629 SmallVector<MVT, 2> RegParmTypes; 1630 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32; 1631 RegParmTypes.push_back(IntVT); 1632 if (VecVT != MVT::Other) 1633 RegParmTypes.push_back(VecVT); 1634 1635 // Compute the set of forwarded registers. The rest are scratch. 1636 SmallVectorImpl<ForwardedRegister> &Forwards = 1637 FuncInfo->getForwardedMustTailRegParms(); 1638 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); 1639 1640 // Forward AL for SysV x86_64 targets, since it is used for varargs. 1641 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) { 1642 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass); 1643 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); 1644 } 1645 1646 // Copy all forwards from physical to virtual registers. 1647 for (ForwardedRegister &FR : Forwards) { 1648 // FIXME: Can we use a less constrained schedule? 1649 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT); 1650 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister( 1651 TargLowering.getRegClassFor(FR.VT)); 1652 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal); 1653 } 1654 } 1655 1656 void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain, 1657 unsigned StackSize) { 1658 // Set FrameIndex to the 0xAAAAAAA value to mark unset state. 1659 // If necessary, it would be set into the correct value later. 
1660 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1661 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1662 1663 if (FrameInfo.hasVAStart()) 1664 createVarArgAreaAndStoreRegisters(Chain, StackSize); 1665 1666 if (FrameInfo.hasMustTailInVarArgFunc()) 1667 forwardMustTailParameters(Chain); 1668 } 1669 1670 SDValue X86TargetLowering::LowerFormalArguments( 1671 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, 1672 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 1673 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 1674 MachineFunction &MF = DAG.getMachineFunction(); 1675 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1676 1677 const Function &F = MF.getFunction(); 1678 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() && 1679 F.getName() == "main") 1680 FuncInfo->setForceFramePointer(true); 1681 1682 MachineFrameInfo &MFI = MF.getFrameInfo(); 1683 bool Is64Bit = Subtarget.is64Bit(); 1684 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); 1685 1686 assert( 1687 !(IsVarArg && canGuaranteeTCO(CallConv)) && 1688 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe"); 1689 1690 // Assign locations to all of the incoming arguments. 1691 SmallVector<CCValAssign, 16> ArgLocs; 1692 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); 1693 1694 // Allocate shadow area for Win64. 1695 if (IsWin64) 1696 CCInfo.AllocateStack(32, Align(8)); 1697 1698 CCInfo.AnalyzeArguments(Ins, CC_X86); 1699 1700 // In vectorcall calling convention a second pass is required for the HVA 1701 // types. 1702 if (CallingConv::X86_VectorCall == CallConv) { 1703 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86); 1704 } 1705 1706 // The next loop assumes that the locations are in the same order of the 1707 // input arguments. 1708 assert(isSortedByValueNo(ArgLocs) && 1709 "Argument Location list must be sorted before lowering"); 1710 1711 SDValue ArgValue; 1712 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; 1713 ++I, ++InsIndex) { 1714 assert(InsIndex < Ins.size() && "Invalid Ins index"); 1715 CCValAssign &VA = ArgLocs[I]; 1716 1717 if (VA.isRegLoc()) { 1718 EVT RegVT = VA.getLocVT(); 1719 if (VA.needsCustom()) { 1720 assert( 1721 VA.getValVT() == MVT::v64i1 && 1722 "Currently the only custom case is when we split v64i1 to 2 regs"); 1723 1724 // v64i1 values, in regcall calling convention, that are 1725 // compiled to 32 bit arch, are split up into two registers. 1726 ArgValue = 1727 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); 1728 } else { 1729 const TargetRegisterClass *RC; 1730 if (RegVT == MVT::i8) 1731 RC = &X86::GR8RegClass; 1732 else if (RegVT == MVT::i16) 1733 RC = &X86::GR16RegClass; 1734 else if (RegVT == MVT::i32) 1735 RC = &X86::GR32RegClass; 1736 else if (Is64Bit && RegVT == MVT::i64) 1737 RC = &X86::GR64RegClass; 1738 else if (RegVT == MVT::f16) 1739 RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass; 1740 else if (RegVT == MVT::f32) 1741 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; 1742 else if (RegVT == MVT::f64) 1743 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; 1744 else if (RegVT == MVT::f80) 1745 RC = &X86::RFP80RegClass; 1746 else if (RegVT == MVT::f128) 1747 RC = &X86::VR128RegClass; 1748 else if (RegVT.is512BitVector()) 1749 RC = &X86::VR512RegClass; 1750 else if (RegVT.is256BitVector()) 1751 RC = Subtarget.hasVLX() ? 
&X86::VR256XRegClass : &X86::VR256RegClass; 1752 else if (RegVT.is128BitVector()) 1753 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass; 1754 else if (RegVT == MVT::x86mmx) 1755 RC = &X86::VR64RegClass; 1756 else if (RegVT == MVT::v1i1) 1757 RC = &X86::VK1RegClass; 1758 else if (RegVT == MVT::v8i1) 1759 RC = &X86::VK8RegClass; 1760 else if (RegVT == MVT::v16i1) 1761 RC = &X86::VK16RegClass; 1762 else if (RegVT == MVT::v32i1) 1763 RC = &X86::VK32RegClass; 1764 else if (RegVT == MVT::v64i1) 1765 RC = &X86::VK64RegClass; 1766 else 1767 llvm_unreachable("Unknown argument type!"); 1768 1769 Register Reg = MF.addLiveIn(VA.getLocReg(), RC); 1770 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1771 } 1772 1773 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1774 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1775 // right size. 1776 if (VA.getLocInfo() == CCValAssign::SExt) 1777 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1778 DAG.getValueType(VA.getValVT())); 1779 else if (VA.getLocInfo() == CCValAssign::ZExt) 1780 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1781 DAG.getValueType(VA.getValVT())); 1782 else if (VA.getLocInfo() == CCValAssign::BCvt) 1783 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); 1784 1785 if (VA.isExtInLoc()) { 1786 // Handle MMX values passed in XMM regs. 1787 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) 1788 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); 1789 else if (VA.getValVT().isVector() && 1790 VA.getValVT().getScalarType() == MVT::i1 && 1791 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || 1792 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { 1793 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 1794 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG); 1795 } else 1796 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1797 } 1798 } else { 1799 assert(VA.isMemLoc()); 1800 ArgValue = 1801 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex); 1802 } 1803 1804 // If value is passed via pointer - do a load. 1805 if (VA.getLocInfo() == CCValAssign::Indirect && 1806 !(Ins[I].Flags.isByVal() && VA.isRegLoc())) { 1807 ArgValue = 1808 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo()); 1809 } 1810 1811 InVals.push_back(ArgValue); 1812 } 1813 1814 for (unsigned I = 0, E = Ins.size(); I != E; ++I) { 1815 if (Ins[I].Flags.isSwiftAsync()) { 1816 auto X86FI = MF.getInfo<X86MachineFunctionInfo>(); 1817 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) 1818 X86FI->setHasSwiftAsyncContext(true); 1819 else { 1820 int PtrSize = Subtarget.is64Bit() ? 8 : 4; 1821 int FI = 1822 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize), false); 1823 X86FI->setSwiftAsyncContextFrameIdx(FI); 1824 SDValue St = DAG.getStore( 1825 DAG.getEntryNode(), dl, InVals[I], 1826 DAG.getFrameIndex(FI, PtrSize == 8 ? MVT::i64 : MVT::i32), 1827 MachinePointerInfo::getFixedStack(MF, FI)); 1828 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain); 1829 } 1830 } 1831 1832 // Swift calling convention does not require we copy the sret argument 1833 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. 
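// As a rough IR-level illustration (hypothetical snippet, not from this file),
// an sret argument looks like:
//
//   %tmp = alloca %struct.S
//   call void @make_s(ptr sret(%struct.S) %tmp)
//
// and on return the callee leaves that pointer in %rax/%eax, which is what the
// SRetReturnReg handling below implements.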
1834 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail) 1835 continue; 1836 1837 // All x86 ABIs require that for returning structs by value we copy the 1838 // sret argument into %rax/%eax (depending on ABI) for the return. Save 1839 // the argument into a virtual register so that we can access it from the 1840 // return points. 1841 if (Ins[I].Flags.isSRet()) { 1842 assert(!FuncInfo->getSRetReturnReg() && 1843 "SRet return has already been set"); 1844 MVT PtrTy = getPointerTy(DAG.getDataLayout()); 1845 Register Reg = 1846 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); 1847 FuncInfo->setSRetReturnReg(Reg); 1848 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]); 1849 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1850 break; 1851 } 1852 } 1853 1854 unsigned StackSize = CCInfo.getStackSize(); 1855 // Align stack specially for tail calls. 1856 if (shouldGuaranteeTCO(CallConv, 1857 MF.getTarget().Options.GuaranteedTailCallOpt)) 1858 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1859 1860 if (IsVarArg) 1861 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo) 1862 .lowerVarArgsParameters(Chain, StackSize); 1863 1864 // Some CCs need callee pop. 1865 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg, 1866 MF.getTarget().Options.GuaranteedTailCallOpt)) { 1867 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1868 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { 1869 // X86 interrupts must pop the error code (and the alignment padding) if 1870 // present. 1871 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4); 1872 } else { 1873 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1874 // If this is an sret function, the return should pop the hidden pointer. 1875 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget)) 1876 FuncInfo->setBytesToPopOnReturn(4); 1877 } 1878 1879 if (!Is64Bit) { 1880 // RegSaveFrameIndex is X86-64 only. 1881 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1882 } 1883 1884 FuncInfo->setArgumentStackSize(StackSize); 1885 1886 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { 1887 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn()); 1888 if (Personality == EHPersonality::CoreCLR) { 1889 assert(Is64Bit); 1890 // TODO: Add a mechanism to frame lowering that will allow us to indicate 1891 // that we'd prefer this slot be allocated towards the bottom of the frame 1892 // (i.e. near the stack pointer after allocating the frame). Every 1893 // funclet needs a copy of this slot in its (mostly empty) frame, and the 1894 // offset from the bottom of this and each funclet's frame must be the 1895 // same, so the size of funclets' (mostly empty) frames is dictated by 1896 // how far this slot is from the bottom (since they allocate just enough 1897 // space to accommodate holding this slot at the correct offset). 
1898 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false); 1899 EHInfo->PSPSymFrameIdx = PSPSymFI; 1900 } 1901 } 1902 1903 if (shouldDisableArgRegFromCSR(CallConv) || 1904 F.hasFnAttribute("no_caller_saved_registers")) { 1905 MachineRegisterInfo &MRI = MF.getRegInfo(); 1906 for (std::pair<Register, Register> Pair : MRI.liveins()) 1907 MRI.disableCalleeSavedRegister(Pair.first); 1908 } 1909 1910 if (CallingConv::PreserveNone == CallConv) 1911 for (unsigned I = 0, E = Ins.size(); I != E; ++I) { 1912 if (Ins[I].Flags.isSwiftSelf() || Ins[I].Flags.isSwiftAsync() || 1913 Ins[I].Flags.isSwiftError()) { 1914 errorUnsupported(DAG, dl, 1915 "Swift attributes can't be used with preserve_none"); 1916 break; 1917 } 1918 } 1919 1920 return Chain; 1921 } 1922 1923 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, 1924 SDValue Arg, const SDLoc &dl, 1925 SelectionDAG &DAG, 1926 const CCValAssign &VA, 1927 ISD::ArgFlagsTy Flags, 1928 bool isByVal) const { 1929 unsigned LocMemOffset = VA.getLocMemOffset(); 1930 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 1931 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 1932 StackPtr, PtrOff); 1933 if (isByVal) 1934 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1935 1936 MaybeAlign Alignment; 1937 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() && 1938 Arg.getSimpleValueType() != MVT::f80) 1939 Alignment = MaybeAlign(4); 1940 return DAG.getStore( 1941 Chain, dl, Arg, PtrOff, 1942 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), 1943 Alignment); 1944 } 1945 1946 /// Emit a load of return address if tail call 1947 /// optimization is performed and it is required. 1948 SDValue X86TargetLowering::EmitTailCallLoadRetAddr( 1949 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, 1950 bool Is64Bit, int FPDiff, const SDLoc &dl) const { 1951 // Adjust the Return address stack slot. 1952 EVT VT = getPointerTy(DAG.getDataLayout()); 1953 OutRetAddr = getReturnAddressFrameIndex(DAG); 1954 1955 // Load the "old" Return address. 1956 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo()); 1957 return SDValue(OutRetAddr.getNode(), 1); 1958 } 1959 1960 /// Emit a store of the return address if tail call 1961 /// optimization is performed and it is required (FPDiff!=0). 1962 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, 1963 SDValue Chain, SDValue RetAddrFrIdx, 1964 EVT PtrVT, unsigned SlotSize, 1965 int FPDiff, const SDLoc &dl) { 1966 // Store the return address to the appropriate stack slot. 1967 if (!FPDiff) return Chain; 1968 // Calculate the new stack slot for the return address. 1969 int NewReturnAddrFI = 1970 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, 1971 false); 1972 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); 1973 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1974 MachinePointerInfo::getFixedStack( 1975 DAG.getMachineFunction(), NewReturnAddrFI)); 1976 return Chain; 1977 } 1978 1979 /// Returns a vector_shuffle mask for an movs{s|d}, movd 1980 /// operation of specified width. 
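// For example, for VT = MVT::v4f32 the mask built below is <4, 1, 2, 3>:
// lane 0 is taken from V2 and lanes 1-3 from V1, matching MOVSS semantics
// (and <2, 1> gives the MOVSD pattern for v2f64).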
1981 SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, 1982 SDValue V1, SDValue V2) const { 1983 unsigned NumElems = VT.getVectorNumElements(); 1984 SmallVector<int, 8> Mask; 1985 Mask.push_back(NumElems); 1986 for (unsigned i = 1; i != NumElems; ++i) 1987 Mask.push_back(i); 1988 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); 1989 } 1990 1991 SDValue 1992 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1993 SmallVectorImpl<SDValue> &InVals) const { 1994 SelectionDAG &DAG = CLI.DAG; 1995 SDLoc &dl = CLI.DL; 1996 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1997 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1998 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1999 SDValue Chain = CLI.Chain; 2000 SDValue Callee = CLI.Callee; 2001 CallingConv::ID CallConv = CLI.CallConv; 2002 bool &isTailCall = CLI.IsTailCall; 2003 bool isVarArg = CLI.IsVarArg; 2004 const auto *CB = CLI.CB; 2005 2006 MachineFunction &MF = DAG.getMachineFunction(); 2007 bool Is64Bit = Subtarget.is64Bit(); 2008 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); 2009 bool IsSibcall = false; 2010 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt || 2011 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail; 2012 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget); 2013 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); 2014 bool HasNCSR = (CB && isa<CallInst>(CB) && 2015 CB->hasFnAttr("no_caller_saved_registers")); 2016 bool HasNoCfCheck = (CB && CB->doesNoCfCheck()); 2017 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall()); 2018 bool IsCFICall = IsIndirectCall && CLI.CFIType; 2019 const Module *M = MF.getFunction().getParent(); 2020 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); 2021 2022 MachineFunction::CallSiteInfo CSInfo; 2023 if (CallConv == CallingConv::X86_INTR) 2024 report_fatal_error("X86 interrupts may not be called directly"); 2025 2026 // Analyze operands of the call, assigning locations to each operand. 2027 SmallVector<CCValAssign, 16> ArgLocs; 2028 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); 2029 2030 // Allocate shadow area for Win64. 2031 if (IsWin64) 2032 CCInfo.AllocateStack(32, Align(8)); 2033 2034 CCInfo.AnalyzeArguments(Outs, CC_X86); 2035 2036 // In vectorcall calling convention a second pass is required for the HVA 2037 // types. 2038 if (CallingConv::X86_VectorCall == CallConv) { 2039 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86); 2040 } 2041 2042 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall(); 2043 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) { 2044 // If we are using a GOT, disable tail calls to external symbols with 2045 // default visibility. Tail calling such a symbol requires using a GOT 2046 // relocation, which forces early binding of the symbol. This breaks code 2047 // that require lazy function symbol resolution. Using musttail or 2048 // GuaranteedTailCallOpt will override this. 2049 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2050 if (!G || (!G->getGlobal()->hasLocalLinkage() && 2051 G->getGlobal()->hasDefaultVisibility())) 2052 isTailCall = false; 2053 } 2054 2055 if (isTailCall && !IsMustTail) { 2056 // Check if it's really possible to do a tail call. 
2057 isTailCall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, 2058 IsCalleePopSRet); 2059 2060 // Sibcalls are automatically detected tailcalls which do not require 2061 // ABI changes. 2062 if (!IsGuaranteeTCO && isTailCall) 2063 IsSibcall = true; 2064 2065 if (isTailCall) 2066 ++NumTailCalls; 2067 } 2068 2069 if (IsMustTail && !isTailCall) 2070 report_fatal_error("failed to perform tail call elimination on a call " 2071 "site marked musttail"); 2072 2073 assert(!(isVarArg && canGuaranteeTCO(CallConv)) && 2074 "Var args not supported with calling convention fastcc, ghc or hipe"); 2075 2076 // Get a count of how many bytes are to be pushed on the stack. 2077 unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); 2078 if (IsSibcall) 2079 // This is a sibcall. The memory operands are available in caller's 2080 // own caller's stack. 2081 NumBytes = 0; 2082 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv)) 2083 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 2084 2085 int FPDiff = 0; 2086 if (isTailCall && 2087 shouldGuaranteeTCO(CallConv, 2088 MF.getTarget().Options.GuaranteedTailCallOpt)) { 2089 // Lower arguments at fp - stackoffset + fpdiff. 2090 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); 2091 2092 FPDiff = NumBytesCallerPushed - NumBytes; 2093 2094 // Set the delta of movement of the returnaddr stackslot. 2095 // But only set if delta is greater than previous delta. 2096 if (FPDiff < X86Info->getTCReturnAddrDelta()) 2097 X86Info->setTCReturnAddrDelta(FPDiff); 2098 } 2099 2100 unsigned NumBytesToPush = NumBytes; 2101 unsigned NumBytesToPop = NumBytes; 2102 2103 // If we have an inalloca argument, all stack space has already been allocated 2104 // for us and be right at the top of the stack. We don't support multiple 2105 // arguments passed in memory when using inalloca. 2106 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { 2107 NumBytesToPush = 0; 2108 if (!ArgLocs.back().isMemLoc()) 2109 report_fatal_error("cannot use inalloca attribute on a register " 2110 "parameter"); 2111 if (ArgLocs.back().getLocMemOffset() != 0) 2112 report_fatal_error("any parameter with the inalloca attribute must be " 2113 "the only memory argument"); 2114 } else if (CLI.IsPreallocated) { 2115 assert(ArgLocs.back().isMemLoc() && 2116 "cannot use preallocated attribute on a register " 2117 "parameter"); 2118 SmallVector<size_t, 4> PreallocatedOffsets; 2119 for (size_t i = 0; i < CLI.OutVals.size(); ++i) { 2120 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) { 2121 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset()); 2122 } 2123 } 2124 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>(); 2125 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB); 2126 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes); 2127 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets); 2128 NumBytesToPush = 0; 2129 } 2130 2131 if (!IsSibcall && !IsMustTail) 2132 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush, 2133 NumBytes - NumBytesToPush, dl); 2134 2135 SDValue RetAddrFrIdx; 2136 // Load return address for tail calls. 2137 if (isTailCall && FPDiff) 2138 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 2139 Is64Bit, FPDiff, dl); 2140 2141 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass; 2142 SmallVector<SDValue, 8> MemOpChains; 2143 SDValue StackPtr; 2144 2145 // The next loop assumes that the locations are in the same order of the 2146 // input arguments. 
2147 assert(isSortedByValueNo(ArgLocs) && 2148 "Argument Location list must be sorted before lowering"); 2149 2150 // Walk the register/memloc assignments, inserting copies/loads. In the case 2151 // of tail call optimization arguments are handle later. 2152 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); 2153 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; 2154 ++I, ++OutIndex) { 2155 assert(OutIndex < Outs.size() && "Invalid Out index"); 2156 // Skip inalloca/preallocated arguments, they have already been written. 2157 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags; 2158 if (Flags.isInAlloca() || Flags.isPreallocated()) 2159 continue; 2160 2161 CCValAssign &VA = ArgLocs[I]; 2162 EVT RegVT = VA.getLocVT(); 2163 SDValue Arg = OutVals[OutIndex]; 2164 bool isByVal = Flags.isByVal(); 2165 2166 // Promote the value if needed. 2167 switch (VA.getLocInfo()) { 2168 default: llvm_unreachable("Unknown loc info!"); 2169 case CCValAssign::Full: break; 2170 case CCValAssign::SExt: 2171 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 2172 break; 2173 case CCValAssign::ZExt: 2174 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 2175 break; 2176 case CCValAssign::AExt: 2177 if (Arg.getValueType().isVector() && 2178 Arg.getValueType().getVectorElementType() == MVT::i1) 2179 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG); 2180 else if (RegVT.is128BitVector()) { 2181 // Special case: passing MMX values in XMM registers. 2182 Arg = DAG.getBitcast(MVT::i64, Arg); 2183 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 2184 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 2185 } else 2186 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 2187 break; 2188 case CCValAssign::BCvt: 2189 Arg = DAG.getBitcast(RegVT, Arg); 2190 break; 2191 case CCValAssign::Indirect: { 2192 if (isByVal) { 2193 // Memcpy the argument to a temporary stack slot to prevent 2194 // the caller from seeing any modifications the callee may make 2195 // as guaranteed by the `byval` attribute. 2196 int FrameIdx = MF.getFrameInfo().CreateStackObject( 2197 Flags.getByValSize(), 2198 std::max(Align(16), Flags.getNonZeroByValAlign()), false); 2199 SDValue StackSlot = 2200 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout())); 2201 Chain = 2202 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl); 2203 // From now on treat this as a regular pointer 2204 Arg = StackSlot; 2205 isByVal = false; 2206 } else { 2207 // Store the argument. 2208 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 2209 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 2210 Chain = DAG.getStore( 2211 Chain, dl, Arg, SpillSlot, 2212 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 2213 Arg = SpillSlot; 2214 } 2215 break; 2216 } 2217 } 2218 2219 if (VA.needsCustom()) { 2220 assert(VA.getValVT() == MVT::v64i1 && 2221 "Currently the only custom case is when we split v64i1 to 2 regs"); 2222 // Split v64i1 value into two registers 2223 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget); 2224 } else if (VA.isRegLoc()) { 2225 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2226 const TargetOptions &Options = DAG.getTarget().Options; 2227 if (Options.EmitCallSiteInfo) 2228 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), I); 2229 if (isVarArg && IsWin64) { 2230 // Win64 ABI requires argument XMM reg to be copied to the corresponding 2231 // shadow reg if callee is a varargs function. 
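// For illustration (assumed example): for a Win64 call like printf("%f", x),
// the double is passed in XMM1 and duplicated into RDX, its shadow GPR, so a
// varargs callee that only looks at the GPR home area still sees the value.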
2232 Register ShadowReg;
2233 switch (VA.getLocReg()) {
2234 case X86::XMM0: ShadowReg = X86::RCX; break;
2235 case X86::XMM1: ShadowReg = X86::RDX; break;
2236 case X86::XMM2: ShadowReg = X86::R8; break;
2237 case X86::XMM3: ShadowReg = X86::R9; break;
2238 }
2239 if (ShadowReg)
2240 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2241 }
2242 } else if (!IsSibcall && (!isTailCall || isByVal)) {
2243 assert(VA.isMemLoc());
2244 if (!StackPtr.getNode())
2245 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2246 getPointerTy(DAG.getDataLayout()));
2247 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2248 dl, DAG, VA, Flags, isByVal));
2249 }
2250 }
2251
2252 if (!MemOpChains.empty())
2253 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2254
2255 if (Subtarget.isPICStyleGOT()) {
2256 // ELF / PIC requires the GOT pointer in the EBX register before function
2257 // calls via the PLT (except for regcall).
2258 if (!isTailCall) {
2259 // An indirect call with the RegCall calling convention may use up all
2260 // the general-purpose registers, so it is not suitable to bind EBX to
2261 // the GOT address here; just let the register allocator handle it.
2262 if (CallConv != CallingConv::X86_RegCall)
2263 RegsToPass.push_back(std::make_pair(
2264 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2265 getPointerTy(DAG.getDataLayout()))));
2266 } else {
2267 // If we are tail calling and generating PIC/GOT style code, load the
2268 // address of the callee into ECX. The value in ECX is used as the target
2269 // of the tail jump. This is done to circumvent the ebx/callee-saved
2270 // problem for tail calls on PIC/GOT architectures. Normally we would just
2271 // put the address of the GOT into ebx and then call target@PLT. But for
2272 // tail calls ebx would be restored (since ebx is callee saved) before
2273 // jumping to the target@PLT.
2274
2275 // Note: The actual moving to ECX is done further down.
2276 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2277 if (G && !G->getGlobal()->hasLocalLinkage() &&
2278 G->getGlobal()->hasDefaultVisibility())
2279 Callee = LowerGlobalAddress(Callee, DAG);
2280 else if (isa<ExternalSymbolSDNode>(Callee))
2281 Callee = LowerExternalSymbol(Callee, DAG);
2282 }
2283 }
2284
2285 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
2286 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
2287 // From the AMD64 ABI document:
2288 // For calls that may call functions that use varargs or stdargs
2289 // (prototype-less calls or calls to functions containing ellipsis (...) in
2290 // the declaration) %al is used as a hidden argument to specify the number
2291 // of SSE registers used. The contents of %al do not need to match exactly
2292 // the number of registers, but must be an upper bound on the number of SSE
2293 // registers used and is in the range 0 - 8 inclusive.
2294
2295 // Count the number of XMM registers allocated.
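// Illustrative example: for printf("%f %f\n", a, b) with two doubles passed in
// XMM0/XMM1, %al is set to 2 below; any upper bound up to 8 would also satisfy
// the ABI.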
2296 static const MCPhysReg XMMArgRegs[] = { 2297 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2298 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2299 }; 2300 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); 2301 assert((Subtarget.hasSSE1() || !NumXMMRegs) 2302 && "SSE registers cannot be used when SSE is disabled"); 2303 RegsToPass.push_back(std::make_pair(Register(X86::AL), 2304 DAG.getConstant(NumXMMRegs, dl, 2305 MVT::i8))); 2306 } 2307 2308 if (isVarArg && IsMustTail) { 2309 const auto &Forwards = X86Info->getForwardedMustTailRegParms(); 2310 for (const auto &F : Forwards) { 2311 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); 2312 RegsToPass.push_back(std::make_pair(F.PReg, Val)); 2313 } 2314 } 2315 2316 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls 2317 // don't need this because the eligibility check rejects calls that require 2318 // shuffling arguments passed in memory. 2319 if (!IsSibcall && isTailCall) { 2320 // Force all the incoming stack arguments to be loaded from the stack 2321 // before any new outgoing arguments are stored to the stack, because the 2322 // outgoing stack slots may alias the incoming argument stack slots, and 2323 // the alias isn't otherwise explicit. This is slightly more conservative 2324 // than necessary, because it means that each store effectively depends 2325 // on every argument instead of just those arguments it would clobber. 2326 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2327 2328 SmallVector<SDValue, 8> MemOpChains2; 2329 SDValue FIN; 2330 int FI = 0; 2331 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E; 2332 ++I, ++OutsIndex) { 2333 CCValAssign &VA = ArgLocs[I]; 2334 2335 if (VA.isRegLoc()) { 2336 if (VA.needsCustom()) { 2337 assert((CallConv == CallingConv::X86_RegCall) && 2338 "Expecting custom case only in regcall calling convention"); 2339 // This means that we are in special case where one argument was 2340 // passed through two register locations - Skip the next location 2341 ++I; 2342 } 2343 2344 continue; 2345 } 2346 2347 assert(VA.isMemLoc()); 2348 SDValue Arg = OutVals[OutsIndex]; 2349 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; 2350 // Skip inalloca/preallocated arguments. They don't require any work. 2351 if (Flags.isInAlloca() || Flags.isPreallocated()) 2352 continue; 2353 // Create frame index. 2354 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2355 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2356 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); 2357 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 2358 2359 if (Flags.isByVal()) { 2360 // Copy relative to framepointer. 2361 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); 2362 if (!StackPtr.getNode()) 2363 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), 2364 getPointerTy(DAG.getDataLayout())); 2365 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 2366 StackPtr, Source); 2367 2368 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2369 ArgChain, 2370 Flags, DAG, dl)); 2371 } else { 2372 // Store relative to framepointer. 2373 MemOpChains2.push_back(DAG.getStore( 2374 ArgChain, dl, Arg, FIN, 2375 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); 2376 } 2377 } 2378 2379 if (!MemOpChains2.empty()) 2380 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); 2381 2382 // Store the return address to the appropriate stack slot. 
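// Rough sketch of the bookkeeping: FPDiff is (argument bytes the caller's
// frame already provides) minus (argument bytes this tail call needs).
// Whenever the sizes differ (FPDiff != 0), the return address loaded earlier
// is re-stored at offset FPDiff - SlotSize so that it stays adjacent to the
// relocated argument area and the callee's 'ret' still finds it where it
// expects.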
2383 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, 2384 getPointerTy(DAG.getDataLayout()), 2385 RegInfo->getSlotSize(), FPDiff, dl); 2386 } 2387 2388 // Build a sequence of copy-to-reg nodes chained together with token chain 2389 // and glue operands which copy the outgoing args into registers. 2390 SDValue InGlue; 2391 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2392 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2393 RegsToPass[i].second, InGlue); 2394 InGlue = Chain.getValue(1); 2395 } 2396 2397 if (DAG.getTarget().getCodeModel() == CodeModel::Large) { 2398 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2399 // In the 64-bit large code model, we have to make all calls 2400 // through a register, since the call instruction's 32-bit 2401 // pc-relative offset may not be large enough to hold the whole 2402 // address. 2403 } else if (Callee->getOpcode() == ISD::GlobalAddress || 2404 Callee->getOpcode() == ISD::ExternalSymbol) { 2405 // Lower direct calls to global addresses and external symbols. Setting 2406 // ForCall to true here has the effect of removing WrapperRIP when possible 2407 // to allow direct calls to be selected without first materializing the 2408 // address into a register. 2409 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true); 2410 } else if (Subtarget.isTarget64BitILP32() && 2411 Callee.getValueType() == MVT::i32) { 2412 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI 2413 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); 2414 } 2415 2416 // Returns a chain & a glue for retval copy to use. 2417 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2418 SmallVector<SDValue, 8> Ops; 2419 2420 if (!IsSibcall && isTailCall && !IsMustTail) { 2421 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl); 2422 InGlue = Chain.getValue(1); 2423 } 2424 2425 Ops.push_back(Chain); 2426 Ops.push_back(Callee); 2427 2428 if (isTailCall) 2429 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32)); 2430 2431 // Add argument registers to the end of the list so that they are known live 2432 // into the call. 2433 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2434 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2435 RegsToPass[i].second.getValueType())); 2436 2437 // Add a register mask operand representing the call-preserved registers. 2438 const uint32_t *Mask = [&]() { 2439 auto AdaptedCC = CallConv; 2440 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists), 2441 // use X86_INTR calling convention because it has the same CSR mask 2442 // (same preserved registers). 2443 if (HasNCSR) 2444 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR; 2445 // If NoCalleeSavedRegisters is requested, than use GHC since it happens 2446 // to use the CSR_NoRegs_RegMask. 2447 if (CB && CB->hasFnAttr("no_callee_saved_registers")) 2448 AdaptedCC = (CallingConv::ID)CallingConv::GHC; 2449 return RegInfo->getCallPreservedMask(MF, AdaptedCC); 2450 }(); 2451 assert(Mask && "Missing call preserved mask for calling convention"); 2452 2453 // If this is an invoke in a 32-bit function using a funclet-based 2454 // personality, assume the function clobbers all registers. If an exception 2455 // is thrown, the runtime will not restore CSRs. 2456 // FIXME: Model this more precisely so that we can register allocate across 2457 // the normal edge and spill and fill across the exceptional edge. 
2458 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) { 2459 const Function &CallerFn = MF.getFunction(); 2460 EHPersonality Pers = 2461 CallerFn.hasPersonalityFn() 2462 ? classifyEHPersonality(CallerFn.getPersonalityFn()) 2463 : EHPersonality::Unknown; 2464 if (isFuncletEHPersonality(Pers)) 2465 Mask = RegInfo->getNoPreservedMask(); 2466 } 2467 2468 // Define a new register mask from the existing mask. 2469 uint32_t *RegMask = nullptr; 2470 2471 // In some calling conventions we need to remove the used physical registers 2472 // from the reg mask. Create a new RegMask for such calling conventions. 2473 // RegMask for calling conventions that disable only return registers (e.g. 2474 // preserve_most) will be modified later in LowerCallResult. 2475 bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR; 2476 if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) { 2477 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); 2478 2479 // Allocate a new Reg Mask and copy Mask. 2480 RegMask = MF.allocateRegMask(); 2481 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs()); 2482 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize); 2483 2484 // Make sure all sub registers of the argument registers are reset 2485 // in the RegMask. 2486 if (ShouldDisableArgRegs) { 2487 for (auto const &RegPair : RegsToPass) 2488 for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first)) 2489 RegMask[SubReg / 32] &= ~(1u << (SubReg % 32)); 2490 } 2491 2492 // Create the RegMask Operand according to our updated mask. 2493 Ops.push_back(DAG.getRegisterMask(RegMask)); 2494 } else { 2495 // Create the RegMask Operand according to the static mask. 2496 Ops.push_back(DAG.getRegisterMask(Mask)); 2497 } 2498 2499 if (InGlue.getNode()) 2500 Ops.push_back(InGlue); 2501 2502 if (isTailCall) { 2503 // We used to do: 2504 //// If this is the first return lowered for this function, add the regs 2505 //// to the liveout set for the function. 2506 // This isn't right, although it's probably harmless on x86; liveouts 2507 // should be computed from returns not tail calls. Consider a void 2508 // function making a tail call to a function returning int. 2509 MF.getFrameInfo().setHasTailCall(); 2510 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); 2511 2512 if (IsCFICall) 2513 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue()); 2514 2515 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); 2516 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); 2517 return Ret; 2518 } 2519 2520 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) { 2521 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops); 2522 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) { 2523 // Calls with a "clang.arc.attachedcall" bundle are special. They should be 2524 // expanded to the call, directly followed by a special marker sequence and 2525 // a call to a ObjC library function. Use the CALL_RVMARKER to do that. 2526 assert(!isTailCall && 2527 "tail calls cannot be marked with clang.arc.attachedcall"); 2528 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode"); 2529 2530 // Add a target global address for the retainRV/claimRV runtime function 2531 // just before the call target. 
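// Illustrative source of such a call (assumed IR shape, not from this file):
//
//   %obj = call ptr @make_obj() [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ]
//
// The function named in the bundle is what getAttachedARCFunction returns
// below; it becomes the extra operand of the CALL_RVMARKER node.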
2532 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
2533 auto PtrVT = getPointerTy(DAG.getDataLayout());
2534 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
2535 Ops.insert(Ops.begin() + 1, GA);
2536 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
2537 } else {
2538 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
2539 }
2540
2541 if (IsCFICall)
2542 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2543
2544 InGlue = Chain.getValue(1);
2545 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2546 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2547
2548 // Save heapallocsite metadata.
2549 if (CLI.CB)
2550 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
2551 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
2552
2553 // Create the CALLSEQ_END node.
2554 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
2555 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2556 DAG.getTarget().Options.GuaranteedTailCallOpt))
2557 NumBytesForCalleeToPop = NumBytes; // Callee pops everything.
2558 else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
2559 // If this call passes a struct-return pointer, the callee
2560 // pops that struct pointer.
2561 NumBytesForCalleeToPop = 4;
2562
2563 // Returns a glue for retval copy to use.
2564 if (!IsSibcall) {
2565 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
2566 InGlue, dl);
2567 InGlue = Chain.getValue(1);
2568 }
2569
2570 if (CallingConv::PreserveNone == CallConv)
2571 for (unsigned I = 0, E = Outs.size(); I != E; ++I) {
2572 if (Outs[I].Flags.isSwiftSelf() || Outs[I].Flags.isSwiftAsync() ||
2573 Outs[I].Flags.isSwiftError()) {
2574 errorUnsupported(DAG, dl,
2575 "Swift attributes can't be used with preserve_none");
2576 break;
2577 }
2578 }
2579
2580 // Handle result values, copying them out of physregs into vregs that we
2581 // return.
2582 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2583 InVals, RegMask);
2584 }
2585
2586 //===----------------------------------------------------------------------===//
2587 // Fast Calling Convention (tail call) implementation
2588 //===----------------------------------------------------------------------===//
2589
2590 // Like stdcall, the callee cleans up the arguments, except that ECX is
2591 // reserved for storing the address of the tail-called function. Only two
2592 // registers are free for argument passing (inreg). Tail call optimization
2593 // is performed provided:
2594 // * tailcallopt is enabled
2595 // * caller/callee are fastcc
2596 // On x86-64 with GOT-style position-independent code, only local
2597 // (within-module) calls are supported at the moment.
2598 // To keep the stack aligned according to the platform ABI, the function
2599 // GetAlignedArgumentStackSize ensures that the argument delta is always a
2600 // multiple of the stack alignment. (Dynamic linkers need this - Darwin's
2601 // dyld, for example.) If a tail-called callee has more arguments than the
2602 // caller, the caller must make sure that there is room to move the RETADDR
2603 // to. This is achieved by reserving an area the size of the argument delta
2604 // right after the original RETADDR, but before the saved frame pointer or
2605 // the spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
2606 // stack layout:
2607 //    arg1
2608 //    arg2
2609 //    RETADDR
2610 //    [ new RETADDR
2611 //      move area ]
2612 //    (possible EBP)
2613 //    ESI
2614 //    EDI
2615 //    local1 ..
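// Worked example for the helper defined below (32-bit, SlotSize = 4, 16-byte
// stack alignment): an incoming StackSize of 16 becomes
// alignTo(16 + 4, 16) - 4 = 28, i.e. a value of the form 16n + 12, so that
// after the 4-byte return address is pushed the stack is 16-byte aligned
// again.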
2616 2617 /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align 2618 /// requirement. 2619 unsigned 2620 X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize, 2621 SelectionDAG &DAG) const { 2622 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign(); 2623 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize(); 2624 assert(StackSize % SlotSize == 0 && 2625 "StackSize must be a multiple of SlotSize"); 2626 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize; 2627 } 2628 2629 /// Return true if the given stack call argument is already available in the 2630 /// same position (relatively) of the caller's incoming argument stack. 2631 static 2632 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2633 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, 2634 const X86InstrInfo *TII, const CCValAssign &VA) { 2635 unsigned Bytes = Arg.getValueSizeInBits() / 8; 2636 2637 for (;;) { 2638 // Look through nodes that don't alter the bits of the incoming value. 2639 unsigned Op = Arg.getOpcode(); 2640 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST || 2641 Op == ISD::AssertZext) { 2642 Arg = Arg.getOperand(0); 2643 continue; 2644 } 2645 if (Op == ISD::TRUNCATE) { 2646 const SDValue &TruncInput = Arg.getOperand(0); 2647 if (TruncInput.getOpcode() == ISD::AssertZext && 2648 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() == 2649 Arg.getValueType()) { 2650 Arg = TruncInput.getOperand(0); 2651 continue; 2652 } 2653 } 2654 break; 2655 } 2656 2657 int FI = INT_MAX; 2658 if (Arg.getOpcode() == ISD::CopyFromReg) { 2659 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2660 if (!VR.isVirtual()) 2661 return false; 2662 MachineInstr *Def = MRI->getVRegDef(VR); 2663 if (!Def) 2664 return false; 2665 if (!Flags.isByVal()) { 2666 if (!TII->isLoadFromStackSlot(*Def, FI)) 2667 return false; 2668 } else { 2669 unsigned Opcode = Def->getOpcode(); 2670 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || 2671 Opcode == X86::LEA64_32r) && 2672 Def->getOperand(1).isFI()) { 2673 FI = Def->getOperand(1).getIndex(); 2674 Bytes = Flags.getByValSize(); 2675 } else 2676 return false; 2677 } 2678 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2679 if (Flags.isByVal()) 2680 // ByVal argument is passed in as a pointer but it's now being 2681 // dereferenced. e.g. 2682 // define @foo(%struct.X* %A) { 2683 // tail call @bar(%struct.X* byval %A) 2684 // } 2685 return false; 2686 SDValue Ptr = Ld->getBasePtr(); 2687 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2688 if (!FINode) 2689 return false; 2690 FI = FINode->getIndex(); 2691 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 2692 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 2693 FI = FINode->getIndex(); 2694 Bytes = Flags.getByValSize(); 2695 } else 2696 return false; 2697 2698 assert(FI != INT_MAX); 2699 if (!MFI.isFixedObjectIndex(FI)) 2700 return false; 2701 2702 if (Offset != MFI.getObjectOffset(FI)) 2703 return false; 2704 2705 // If this is not byval, check that the argument stack object is immutable. 2706 // inalloca and argument copy elision can create mutable argument stack 2707 // objects. Byval objects can be mutated, but a byval call intends to pass the 2708 // mutated memory. 
2709 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI)) 2710 return false; 2711 2712 if (VA.getLocVT().getFixedSizeInBits() > 2713 Arg.getValueSizeInBits().getFixedValue()) { 2714 // If the argument location is wider than the argument type, check that any 2715 // extension flags match. 2716 if (Flags.isZExt() != MFI.isObjectZExt(FI) || 2717 Flags.isSExt() != MFI.isObjectSExt(FI)) { 2718 return false; 2719 } 2720 } 2721 2722 return Bytes == MFI.getObjectSize(FI); 2723 } 2724 2725 /// Check whether the call is eligible for tail call optimization. Targets 2726 /// that want to do tail call optimization should implement this function. 2727 /// Note that the x86 backend does not check musttail calls for eligibility! The 2728 /// rest of x86 tail call lowering must be prepared to forward arguments of any 2729 /// type. 2730 bool X86TargetLowering::IsEligibleForTailCallOptimization( 2731 TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo, 2732 SmallVectorImpl<CCValAssign> &ArgLocs, bool IsCalleePopSRet) const { 2733 SelectionDAG &DAG = CLI.DAG; 2734 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 2735 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 2736 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 2737 SDValue Callee = CLI.Callee; 2738 CallingConv::ID CalleeCC = CLI.CallConv; 2739 bool isVarArg = CLI.IsVarArg; 2740 2741 if (!mayTailCallThisCC(CalleeCC)) 2742 return false; 2743 2744 // If -tailcallopt is specified, make fastcc functions tail-callable. 2745 MachineFunction &MF = DAG.getMachineFunction(); 2746 const Function &CallerF = MF.getFunction(); 2747 2748 // If the function return type is x86_fp80 and the callee return type is not, 2749 // then the FP_EXTEND of the call result is not a nop. It's not safe to 2750 // perform a tailcall optimization here. 2751 if (CallerF.getReturnType()->isX86_FP80Ty() && !CLI.RetTy->isX86_FP80Ty()) 2752 return false; 2753 2754 CallingConv::ID CallerCC = CallerF.getCallingConv(); 2755 bool CCMatch = CallerCC == CalleeCC; 2756 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); 2757 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); 2758 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt || 2759 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail; 2760 2761 // Win64 functions have extra shadow space for argument homing. Don't do the 2762 // sibcall if the caller and callee have mismatched expectations for this 2763 // space. 2764 if (IsCalleeWin64 != IsCallerWin64) 2765 return false; 2766 2767 if (IsGuaranteeTCO) { 2768 if (canGuaranteeTCO(CalleeCC) && CCMatch) 2769 return true; 2770 return false; 2771 } 2772 2773 // Look for obvious safe cases to perform tail call optimization that do not 2774 // require ABI changes. This is what gcc calls sibcall. 2775 2776 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2777 // emit a special epilogue. 2778 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); 2779 if (RegInfo->hasStackRealignment(MF)) 2780 return false; 2781 2782 // Also avoid sibcall optimization if we're an sret return fn and the callee 2783 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is 2784 // insufficient. 2785 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) { 2786 // For a compatible tail call the callee must return our sret pointer. So it 2787 // needs to be (a) an sret function itself and (b) we pass our sret as its 2788 // sret. Condition #b is harder to determine. 
2789 return false; 2790 } else if (IsCalleePopSRet) 2791 // The callee pops an sret, so we cannot tail-call, as our caller doesn't 2792 // expect that. 2793 return false; 2794 2795 // Do not sibcall optimize vararg calls unless all arguments are passed via 2796 // registers. 2797 LLVMContext &C = *DAG.getContext(); 2798 if (isVarArg && !Outs.empty()) { 2799 // Optimizing for varargs on Win64 is unlikely to be safe without 2800 // additional testing. 2801 if (IsCalleeWin64 || IsCallerWin64) 2802 return false; 2803 2804 for (const auto &VA : ArgLocs) 2805 if (!VA.isRegLoc()) 2806 return false; 2807 } 2808 2809 // If the call result is in ST0 / ST1, it needs to be popped off the x87 2810 // stack. Therefore, if it's not used by the call it is not safe to optimize 2811 // this into a sibcall. 2812 bool Unused = false; 2813 for (const auto &In : Ins) { 2814 if (!In.Used) { 2815 Unused = true; 2816 break; 2817 } 2818 } 2819 if (Unused) { 2820 SmallVector<CCValAssign, 16> RVLocs; 2821 CCState RVCCInfo(CalleeCC, false, MF, RVLocs, C); 2822 RVCCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2823 for (const auto &VA : RVLocs) { 2824 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) 2825 return false; 2826 } 2827 } 2828 2829 // Check that the call results are passed in the same way. 2830 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, 2831 RetCC_X86, RetCC_X86)) 2832 return false; 2833 // The callee has to preserve all registers the caller needs to preserve. 2834 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); 2835 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 2836 if (!CCMatch) { 2837 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 2838 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 2839 return false; 2840 } 2841 2842 unsigned StackArgsSize = CCInfo.getStackSize(); 2843 2844 // If the callee takes no arguments then go on to check the results of the 2845 // call. 2846 if (!Outs.empty()) { 2847 if (StackArgsSize > 0) { 2848 // Check if the arguments are already laid out in the right way as 2849 // the caller's fixed stack objects. 2850 MachineFrameInfo &MFI = MF.getFrameInfo(); 2851 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2852 const X86InstrInfo *TII = Subtarget.getInstrInfo(); 2853 for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) { 2854 const CCValAssign &VA = ArgLocs[I]; 2855 SDValue Arg = OutVals[I]; 2856 ISD::ArgFlagsTy Flags = Outs[I].Flags; 2857 if (VA.getLocInfo() == CCValAssign::Indirect) 2858 return false; 2859 if (!VA.isRegLoc()) { 2860 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, 2861 TII, VA)) 2862 return false; 2863 } 2864 } 2865 } 2866 2867 bool PositionIndependent = isPositionIndependent(); 2868 // If the tailcall address may be in a register, then make sure it's 2869 // possible to register allocate for it. In 32-bit, the call address can 2870 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2871 // callee-saved registers are restored. These happen to be the same 2872 // registers used to pass 'inreg' arguments so watch out for those. 2873 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) && 2874 !isa<ExternalSymbolSDNode>(Callee)) || 2875 PositionIndependent)) { 2876 unsigned NumInRegs = 0; 2877 // In PIC we need an extra register to formulate the address computation 2878 // for the callee. 2879 unsigned MaxInRegs = PositionIndependent ? 
2 : 3; 2880 2881 for (const auto &VA : ArgLocs) { 2882 if (!VA.isRegLoc()) 2883 continue; 2884 Register Reg = VA.getLocReg(); 2885 switch (Reg) { 2886 default: break; 2887 case X86::EAX: case X86::EDX: case X86::ECX: 2888 if (++NumInRegs == MaxInRegs) 2889 return false; 2890 break; 2891 } 2892 } 2893 } 2894 2895 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2896 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 2897 return false; 2898 } 2899 2900 bool CalleeWillPop = 2901 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, 2902 MF.getTarget().Options.GuaranteedTailCallOpt); 2903 2904 if (unsigned BytesToPop = 2905 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) { 2906 // If we have bytes to pop, the callee must pop them. 2907 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; 2908 if (!CalleePopMatches) 2909 return false; 2910 } else if (CalleeWillPop && StackArgsSize > 0) { 2911 // If we don't have bytes to pop, make sure the callee doesn't pop any. 2912 return false; 2913 } 2914 2915 return true; 2916 } 2917 2918 /// Determines whether the callee is required to pop its own arguments. 2919 /// Callee pop is necessary to support tail calls. 2920 bool X86::isCalleePop(CallingConv::ID CallingConv, 2921 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { 2922 // If GuaranteeTCO is true, we force some calls to be callee pop so that we 2923 // can guarantee TCO. 2924 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) 2925 return true; 2926 2927 switch (CallingConv) { 2928 default: 2929 return false; 2930 case CallingConv::X86_StdCall: 2931 case CallingConv::X86_FastCall: 2932 case CallingConv::X86_ThisCall: 2933 case CallingConv::X86_VectorCall: 2934 return !is64Bit; 2935 } 2936 } 2937
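// For illustration (32-bit, hypothetical prototype): an stdcall callee such as
//
//   int __stdcall f(int a, int b);
//
// returns with 'ret 8', popping its own 8 bytes of arguments, whereas a cdecl
// callee returns with a plain 'ret' and the caller adjusts ESP. That
// callee-pop behavior is what makes guaranteed tail calls workable when the
// argument sizes of caller and callee differ.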