//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but this should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but this should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets which have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
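  // The attribute value is parsed as a comma-separated pair, e.g.
  // "amdgpu-waves-per-eu"="2,4" requests between 2 and 4 waves per EU; the
  // trailing 'true' argument below allows the second value to be omitted.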
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
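  // For example, a reqd_work_group_size of 256 in the queried dimension yields
  // a !range of [0, 256) for a workitem id query and [256, 257) for a local
  // size query.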
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align::None();

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
                                         SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Src->isInstr() || !Dst->isInstr())
    return;

  MachineInstr *SrcI = Src->getInstr();
  MachineInstr *DstI = Dst->getInstr();

  if (SrcI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(SrcI->getIterator());
    MachineBasicBlock::const_instr_iterator E(SrcI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (DstI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DstI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DstI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
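    // SUa tracks the previous memory operation; when the current and previous
    // operations are of the same kind (VMEM, FLAT, SMRD or DS), the artificial
    // edges added below keep them adjacent in the final schedule.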
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};

struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Returns the number
  // of instructions linked. Links up to MaxChain instructions.
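  // Starting from To, SALU successors are visited via a worklist and given an
  // artificial dependency on From (the MFMA), so that they can be scheduled
  // into its shadow.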
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so that they can fill the MFMA's shadow.
    // Filling the shadow with SALU rather than VALU instructions is desirable
    // to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}