//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> DisablePowerSched(
    "amdgpu-disable-power-sched",
    cl::desc("Disable scheduling to minimize mAI power bursts"),
    cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> EnableFlatScratch(
    "amdgpu-enable-flat-scratch",
    cl::desc("Use flat scratch instructions"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
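  // Only one wavefrontsize feature may remain set. For example, a feature
  // string containing "+wavefrontsize32" gets "-wavefrontsize16," and
  // "-wavefrontsize64," added to FullFS below; since FS itself is appended
  // last, the explicit request still wins over any processor default.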
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
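  // A WavefrontSizeLog2 of 0 would imply a 1-lane wave; fall back to
  // 1 << 5 = 32 lanes so later divisions by the wave size stay meaningful.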
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return flatScratchIsArchitected() ||
         (EnableFlatScratch && hasFlatScratchInsts());
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum is not larger than the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
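  // (I.e. the requested [min, max] range must lie within
  // [getMinFlatWorkGroupSize(), getMaxFlatWorkGroupSize()]; otherwise fall
  // back to Default.)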
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum is not larger than the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
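  //
  // Illustrative example (exact offsets and alignments are target dependent):
  // a single i32 explicit argument gives ExplicitArgBytes = 4; with 56
  // implicit bytes and an assumed 8-byte implicit-arg alignment, TotalSize
  // becomes alignTo(4, 8) + 56 = 64, already a multiple of 4.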
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // Dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
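    // For each MFMA found below, its latency minus one gives the number of
    // shadow slots we try to fill with independent SALU chains.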
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}