//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
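  // If the features string explicitly enables any wavefront size, clear the
  // wavefrontsize* features it does not mention so that only the requested
  // size survives (e.g. "+wavefrontsize32" in FS also implies
  // "-wavefrontsize16" and "-wavefrontsize64" here).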
  if (FS.find_insensitive("+wavefrontsize") != StringRef::npos) {
    if (FS.find_insensitive("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_insensitive("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_insensitive("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
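  // If no wavefront size feature was set, fall back to a wave32 layout
  // (WavefrontSizeLog2 == 5, i.e. 1 << 5 == 32 lanes).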
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return flatScratchIsArchitected() ||
         (EnableFlatScratch && hasFlatScratchInsts());
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
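/// Returns true if, on the current generation, the given 16-bit opcode is
/// known to write zeros into the unused high 16 bits of its 32-bit
/// destination VGPR.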
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
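    // The opcodes in this group are only known to zero the high bits on VI
    // (gfx8), so conservatively report false for gfx9 and later.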
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
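  // For example, a kernel carrying the attribute
  // "amdgpu-flat-work-group-size"="128,256" (illustrative values) requests a
  // minimum of 128 and a maximum of 256 work items per work group; requests
  // outside the range supported by the subtarget fall back to Default below.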
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
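  // An OpenCL kernel declared with
  // __attribute__((reqd_work_group_size(X, Y, Z))) carries this as
  // !reqd_work_group_size function metadata with three i32 operands, e.g.
  // !{i32 64, i32 1, i32 1} (illustrative sizes), giving an exact bound for
  // the queried dimension.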
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ?
         AMDGPUDwarfFlavour::Wave32 : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratchInit) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic to detect if the function has flat scratch init is slightly
  // different from how the SIMachineFunctionInfo constructor derives it.
  // We don't use the amdgpu-calls and amdgpu-stack-objects attributes or
  // isAmdHsaOrMesa here, as it doesn't really matter.
  // TODO: Outline this derivation logic and have just one common function in
  // the backend to avoid duplication.
  bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
  bool FunctionHasFlatScratchInit = false;
  if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
      enableFlatScratch()) {
    FunctionHasFlatScratchInit = true;
  }
  return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
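    // (A higher wave count per execution unit leaves fewer SGPRs for each
    // wave, so an explicit "amdgpu-num-sgpr" request can conflict with the
    // requested wave range checked below.)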
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // Dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
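  // The edges added here are artificial scheduling edges (SDep::Artificial);
  // they only constrain ordering and do not model real data dependencies.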
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so they get a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU rather than VALU instructions
    // is desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}