//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but this should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but this should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets with d16 loads that don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
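  // For example, with a reqd_work_group_size of 256 in the queried dimension,
  // a workitem ID query gets !range [0, 256) while a local size query gets
  // !range [256, 257).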
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align::None();

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total.
    // Theoretically you could re-use the last input registers for these
    // special registers, but this would require a lot of complexity to deal
    // with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process, SUnits are in the original order of
    // the instructions before scheduling.
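    // The pair is linked with a barrier edge plus artificial edges so that
    // no other instruction can be scheduled between the two memory operations.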
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};

struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}