//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the GCN specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "GCNSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSelectionDAGInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "gcn-subtarget"

// Pull in the TableGen-generated subtarget description and the generated
// feature-parsing constructor. The generated code is emitted under the name
// AMDGPUSubtarget, so temporarily rename it to GCNSubtarget for this TU.
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(2), cl::Hidden);

GCNSubtarget::~GCNSubtarget() = default;

/// Build the full feature string for \p GPU from \p FS plus target defaults,
/// parse it, and then fix up any derived/interdependent subtarget state
/// (generation default, wavefront size, FlatForGlobal, memory sizes, ...).
/// Returns *this so it can be used in the constructor's initializer list to
/// guarantee the subtarget is fully initialized before dependent members
/// (e.g. InstrInfo) are constructed.
GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                            StringRef GPU,
                                                            StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
  // default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits. If the user's FS requests any explicit
  // wavefront size, explicitly turn off each size the FS does not mention so
  // only one survives feature parsing.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  // User-specified features come last so they override the defaults above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which acts as the default when no
  // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes defaults
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
    // Assume wave64 for the unknown target, if not explicitly set.
    if (getWavefrontSizeLog2() == 0)
      WavefrontSizeLog2 = 6;
  } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
             !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size it must be a generation before gfx10,
    // these have FeatureWavefrontSize64 in their definition already. For gfx10+
    // set wave32 as a default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
    WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? 5 : 6;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.isAMDGCN() && AddressableLocalMemorySize == 0)
    AddressableLocalMemorySize = 32768;

  LocalMemorySize = AddressableLocalMemorySize;
  // On gfx10+ without CU mode, twice the addressable LDS is physically
  // present per WGP.
  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  // Derive the xnack/sramecc target-ID settings from the raw feature string.
  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

/// Diagnose per-function feature combinations that the subtarget cannot
/// support; currently only rejects requesting both wave32 and wave64.
void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
  LLVMContext &Ctx = F.getContext();
  if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    Ctx.diagnose(DiagnosticInfoUnsupported(
        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
  }
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    // Note: runs initializeSubtargetDependencies() before InstrInfo and
    // TLInfo are constructed, so they see a fully-configured subtarget.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);

  TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();

  // GlobalISel support objects.
  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}

const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
  return TSInfo.get();
}

/// \returns how many operands of \p Opcode may be read from the scalar
/// constant bus: 1 before gfx10, and on gfx10+ a value of 2 except for the
/// listed 64-bit shift opcodes, which remain limited to 1.
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of 0ing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
    // Deliberately listed: handled the same as the default (not known to
    // zero the high bits).
  default:
    return false;
  }
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

// V_MAD_F16 is available iff its pseudo maps to a real MC opcode on this
// subtarget.
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

// Use GPR-index mode when the hardware supports it and either movrel is
// unavailable or the -amdgpu-vgpr-index-mode flag forces it.
bool GCNSubtarget::useVGPRIndexMode() const {
  return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
                                                   getGeneration());
}

unsigned
GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
                                       unsigned DynamicVGPRBlockSize) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs,
                                                       DynamicVGPRBlockSize);
}

/// \returns the number of SGPRs reserved for special registers (VCC and,
/// depending on generation/features, FLAT_SCRATCH and XNACK).
unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve SGPR pair used for flat_scratch if
  // we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

/// \returns {minimum, maximum} occupancy for \p F, where the maximum derived
/// from workgroup sizes/LDS is further capped by SGPR and VGPR usage.
std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                               unsigned NumSGPRs, unsigned NumVGPRs) const {
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  // Temporarily check both the attribute and the subtarget feature until the
  // latter is removed.
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
  unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
  unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);

  // Maximum occupancy may be further limited by high SGPR/VGPR usage.
  MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
  return {std::min(MinOcc, MaxOcc), MaxOcc};
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  unsigned Requested =
      F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

  if (Requested != MaxNumSGPRs) {
    // Make sure requested value does not violate subtarget's specifications.
    // A request of 0 (or <= the reserved count) means "no valid request".
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware bug workaround: clamp to the fixed count on affected targets.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

/// \returns the worst-case number of preloaded SGPRs: every user SGPR field
/// plus every system SGPR plus synthetic SGPRs, assuming all are enabled.
unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

/// \returns the VGPR budget for \p F: the "amdgpu-num-vgpr" attribute value
/// (doubled on gfx90a-insts targets) clamped into \p NumVGPRBounds, or the
/// bounds' maximum when no attribute is present.
unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
  const auto &[Min, Max] = NumVGPRBounds;

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.

  unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
  if (Requested != Max && hasGFX90AInsts())
    Requested *= 2;

  // Make sure requested value is inside the range of possible VGPR usage.
  return std::clamp(Requested, Min, Max);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is removed.
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
  return getBaseMaxNumVGPRs(
      F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
          getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  return getMaxNumVGPRs(MF.getFunction());
}

/// Adjust the latency of a data dependence, handling bundles (where the
/// defining or using instruction is inside a bundle) and a VCC_LO special
/// case; non-data edges and non-instruction SUnits are left untouched.
void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
      !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    // The def is a bundle header: find the last instruction in the bundle
    // that writes the register, and discount its latency by the number of
    // bundled instructions issued after it.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    // The use is a bundle header: start from the def's latency and discount
    // one cycle per bundled instruction issued before the first reader.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

/// \returns the MIMG NSA threshold: 0 on gfx12+ (not MIMG encoding),
/// otherwise the command-line option if explicitly given, else the
/// "amdgpu-nsa-threshold" function attribute, else the option's default.
/// Explicit values are clamped to a minimum of 2.
unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return NSAThreshold;
}

GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;

  // Kernels need the kernarg segment pointer if they have any explicit or
  // implicit arguments.
  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  // Compute-only user SGPRs, each suppressible via an "amdgpu-no-*"
  // attribute.
  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      // FlatScratchInit cannot be true for graphics CC if enableFlatScratch()
      // is false.
      (ST.enableFlatScratch() ||
       (!AMDGPU::isGraphics(CC) &&
        !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  // Total up the user SGPRs consumed by each enabled field.
  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}

/// Record \p NumSGPRs user SGPRs as consumed by kernarg preloading; must not
/// exceed the subtarget's maximum user-SGPR count.
void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}