//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> EnablePowerSched(
    "amdgpu-enable-power-sched",
    cl::desc("Enable scheduling to minimize mAI power bursts"),
    cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned> NSAThreshold(
    "amdgpu-nsa-threshold",
    cl::desc("Number of addresses from which to enable MIMG NSA."),
    cl::init(3), cl::Hidden);

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
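  // If the user explicitly requested a wavefront size, turn off the
  // wavefront-size features that were not named so only one remains set.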
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  AddressableLocalMemorySize = LocalMemorySize;

  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  // Don't crash on invalid devices.
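  // Default to a wave32 wavefront (log2 == 5) when no wavefront size was set.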
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}

bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
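/// Returns true if, on this subtarget, \p Opcode is known to write zeros to
/// the high 16 bits of its 32-bit destination register.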
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing them; some changed
    // to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}

// FIXME: Should return min,max range.
//
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
// be achieved when only the given function is running on the machine, taking
// into account the overall number of wave slots, the (maximum) workgroup
// size, and the per-workgroup LDS allocation size.
unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                              const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves per CU.
  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Number of waves per EU (SIMD).
  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  // MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default.
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ?
          256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
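  // Only enable lane-mask tracking when the default machine scheduler is in
  // use.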
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
      std::min(getMaxWavesPerEU(),
               getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs.
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs.
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs.
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Link as many SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
        if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
          ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
            DAG->canAddEdge(SUv, SU))
          DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts())
      return;
    DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill the MFMA
    // shadow. That is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit *, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for (; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
            !DAG->canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
                          : nullptr;
}

unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.
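
  // An explicit command-line threshold takes precedence, but is clamped to a
  // minimum of two addresses.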
  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return 3;
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(
        MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget &>(
        MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(
        TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget &>(
        TM.getSubtarget<R600Subtarget>(F));
}

GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
  // FIXME: Should have analysis or something rather than attribute to detect
  // calls.
  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
  // FIXME: This attribute is a hack, we just need an analysis on the function
  // to look for allocas.
  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // TODO: This could be refined a lot. The attribute is a poor way of
  // detecting calls or stack objects that may require it before argument
  // lowering.
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
}

void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}