//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  // Disable mutually exclusive bits.
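  // If the feature string explicitly requests a wavefront size, clear the
  // other size bits so the parsed feature set does not end up with
  // conflicting wavefront sizes.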
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
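  // Fall back to wave64 (log2 == 5) when no wavefront size was set.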
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
    F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with the values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query the values lie in
  // [0, MaxSize), so MaxSize itself is the exclusive upper bound. For a size
  // query the result can be MaxSize, so pass MaxSize + 1 as the upper bound.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ?
        Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
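  // Lane-mask tracking is therefore only enabled when the SI scheduler is not
  // in use.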
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
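  // The added edges are artificial, so they only bias the scheduling order and
  // do not model real data dependences.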
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so that those can fill the MFMA shadow.
    // Filling the shadow with SALU rather than VALU instructions is desirable
    // to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}