//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
    F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
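  // For example, with a reqd_work_group_size of 64 in this dimension, a
  // workitem-id query gets the range [0, 64), while a local-size query gets
  // [64, 65).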
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment = DL.getABITypeAlign(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for (; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}