//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// AMD GCN specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H

#include "AMDGPUCallLowering.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIFrameLowering.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Support/ErrorHandling.h"

// Selects the header portion of the tablegen-generated subtarget info.
#define GET_SUBTARGETINFO_HEADER
#include "AMDGPUGenSubtargetInfo.inc"

namespace llvm {

class GCNTargetMachine;

/// Subtarget description for GCN targets. Combines the tablegen-generated
/// AMDGPUGenSubtargetInfo with the shared AMDGPUSubtarget base.
class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
                           public AMDGPUSubtarget {
public:
  using AMDGPUSubtarget::getMaxWavesPerEU;

  // The following two enums are documented at:
  //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi

  /// Trap handler ABI in effect for the target OS.
  enum class TrapHandlerAbi {
    NONE   = 0x00,
    AMDHSA = 0x01,
  };

  /// Trap identifiers used with the AMDHSA trap handler ABI.
  enum class TrapID {
    LLVMAMDHSATrap      = 0x02,
    LLVMAMDHSADebugTrap = 0x03,
  };

private:
  /// SelectionDAGISel related APIs.
  std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;

  /// GlobalISel related APIs.
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
  std::unique_ptr<InstructionSelector> InstSelector;
  std::unique_ptr<LegalizerInfo> Legalizer;
  std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;

protected:
  // Basic subtarget description.
Triple TargetTriple;
AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
// Stored as a plain unsigned; getGeneration() converts it to Generation.
unsigned Gen = INVALID;
InstrItineraryData InstrItins;
int LDSBankCount = 0;
unsigned MaxPrivateElementSize = 0;

// Possibly statically set by tablegen, but may want to be overridden.
bool FastDenormalF32 = false;
bool HalfRate64Ops = false;
bool FullRate64Ops = false;

// Dynamically set bits that enable features.
bool FlatForGlobal = false;
bool AutoWaitcntBeforeBarrier = false;
bool BackOffBarrier = false;
bool UnalignedScratchAccess = false;
bool UnalignedAccessMode = false;
bool RelaxedBufferOOBMode = false;
bool HasApertureRegs = false;
bool SupportsXNACK = false;
bool KernargPreload = false;

// This should not be used directly. 'TargetID' tracks the dynamic settings
// for XNACK.
bool EnableXNACK = false;

bool EnableTgSplit = false;
bool EnableCuMode = false;
bool TrapHandler = false;
bool EnablePreciseMemory = false;

// Used as options.
bool EnableLoadStoreOpt = false;
bool EnableUnsafeDSOffsetFolding = false;
bool EnableSIScheduler = false;
bool EnableDS128 = false;
bool EnablePRTStrictNull = false;
bool DumpCode = false;

// Static subtarget properties set by tablegen.
bool FP64 = false;
bool FMA = false;
bool MIMG_R128 = false;
bool CIInsts = false;
bool GFX8Insts = false;
bool GFX9Insts = false;
bool GFX90AInsts = false;
bool GFX940Insts = false;
bool GFX950Insts = false;
bool GFX10Insts = false;
bool GFX11Insts = false;
bool GFX12Insts = false;
bool GFX1250Insts = false;
bool GFX10_3Insts = false;
bool GFX7GFX8GFX9Insts = false;
bool SGPRInitBug = false;
bool UserSGPRInit16Bug = false;
bool NegativeScratchOffsetBug = false;
bool NegativeUnalignedScratchOffsetBug = false;
bool HasSMemRealTime = false;
bool HasIntClamp = false;
bool HasFmaMixInsts = false;
bool HasMovrel = false;
bool HasVGPRIndexMode = false;
bool HasScalarDwordx3Loads = false;
bool HasScalarStores = false;
bool HasScalarAtomics = false;
bool HasSDWAOmod = false;
bool HasSDWAScalar = false;
bool HasSDWASdst = false;
bool HasSDWAMac = false;
bool HasSDWAOutModsVOPC = false;
bool HasDPP = false;
bool HasDPP8 = false;
bool HasDPALU_DPP = false;
bool HasDPPSrc1SGPR = false;
bool HasPackedFP32Ops = false;
bool HasImageInsts = false;
bool HasExtendedImageInsts = false;
bool HasR128A16 = false;
bool HasA16 = false;
bool HasG16 = false;
bool HasNSAEncoding = false;
bool HasPartialNSAEncoding = false;
bool GFX10_AEncoding = false;
bool GFX10_BEncoding = false;
bool HasDLInsts = false;
bool HasFmacF64Inst = false;
bool HasDot1Insts = false;
bool HasDot2Insts = false;
bool HasDot3Insts = false;
bool HasDot4Insts = false;
bool HasDot5Insts = false;
bool HasDot6Insts = false;
bool HasDot7Insts = false;
bool HasDot8Insts = false;
bool HasDot9Insts = false;
bool HasDot10Insts = false;
bool HasDot11Insts = false;
bool HasDot12Insts = false;
bool HasDot13Insts = false;
bool HasMAIInsts = false;
bool HasFP8Insts = false;
bool HasFP8ConversionInsts = false;
bool HasFP8E5M3Insts = false;
bool HasCvtFP8Vop1Bug = false;
bool HasPkFmacF16Inst = false;
bool HasAtomicFMinFMaxF32GlobalInsts = false;
bool HasAtomicFMinFMaxF64GlobalInsts = false;
bool HasAtomicFMinFMaxF32FlatInsts = false;
bool HasAtomicFMinFMaxF64FlatInsts = false;
bool HasAtomicDsPkAdd16Insts = false;
bool HasAtomicFlatPkAdd16Insts = false;
bool HasAtomicFaddRtnInsts = false;
bool HasAtomicFaddNoRtnInsts = false;
bool HasMemoryAtomicFaddF32DenormalSupport = false;
bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
bool HasAtomicBufferGlobalPkAddF16Insts = false;
bool HasAtomicCSubNoRtnInsts = false;
bool HasAtomicGlobalPkAddBF16Inst = false;
bool HasAtomicBufferPkAddBF16Inst = false;
bool HasFlatAtomicFaddF32Inst = false;
bool HasFlatBufferGlobalAtomicFaddF64Inst = false;
bool HasDefaultComponentZero = false;
bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
bool HasDefaultComponentBroadcast = false;
bool HasXF32Insts = false;
/// The maximum number of instructions that may be placed within an S_CLAUSE,
/// which is one greater than the maximum argument to S_CLAUSE. A value of 0
/// indicates a lack of S_CLAUSE support.
unsigned MaxHardClauseLength = 0;
bool SupportsSRAMECC = false;
bool DynamicVGPR = false;
bool DynamicVGPRBlockSize32 = false;
bool HasVMemToLDSLoad = false;

// This should not be used directly. 'TargetID' tracks the dynamic settings
// for SRAMECC.
bool EnableSRAMECC = false;

bool HasNoSdstCMPX = false;
bool HasVscnt = false;
bool HasWaitXcnt = false;
bool HasGetWaveIdInst = false;
bool HasSMemTimeInst = false;
bool HasShaderCyclesRegister = false;
bool HasShaderCyclesHiLoRegisters = false;
bool HasVOP3Literal = false;
bool HasNoDataDepHazard = false;
bool FlatAddressSpace = false;
bool FlatInstOffsets = false;
bool FlatGlobalInsts = false;
bool FlatScratchInsts = false;
bool ScalarFlatScratchInsts = false;
bool HasArchitectedFlatScratch = false;
bool EnableFlatScratch = false;
bool HasArchitectedSGPRs = false;
bool HasGDS = false;
bool HasGWS = false;
bool AddNoCarryInsts = false;
bool HasUnpackedD16VMem = false;
bool LDSMisalignedBug = false;
bool HasMFMAInlineLiteralBug = false;
bool UnalignedBufferAccess = false;
bool UnalignedDSAccess = false;
bool HasPackedTID = false;
bool ScalarizeGlobal = false;
bool HasSALUFloatInsts = false;
bool HasPseudoScalarTrans = false;
bool HasRestrictedSOffset = false;
bool Has64BitLiterals = false;
bool HasBitOp3Insts = false;
bool HasTransposeLoadF4F6Insts = false;
bool HasPrngInst = false;
bool HasBVHDualAndBVH8Insts = false;
bool HasPermlane16Swap = false;
bool HasPermlane32Swap = false;
bool HasVcmpxPermlaneHazard = false;
bool HasVMEMtoScalarWriteHazard = false;
bool HasSMEMtoVectorWriteHazard = false;
bool HasInstFwdPrefetchBug = false;
bool HasSafeSmemPrefetch = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
bool HasNSAClauseBug = false;
bool HasOffset3fBug = false;
bool HasFlatSegmentOffsetBug = false;
bool HasImageStoreD16Bug = false;
bool HasImageGather4D16Bug = false;
bool HasMSAALoadDstSelBug = false;
bool HasPrivEnabledTrap2NopBug = false;
bool Has1_5xVGPRs = false;
bool HasMADIntraFwdBug = false;
bool HasVOPDInsts = false;
bool HasVALUTransUseHazard = false;
bool HasRequiredExportPriority = false;
bool HasVmemWriteVgprInOrder = false;
bool HasAshrPkInsts = false;
bool HasIEEEMinimumMaximumInsts = false;
bool HasMinimum3Maximum3F32 = false;
bool HasMinimum3Maximum3F16 = false;
bool HasMinimum3Maximum3PKF16 = false;
bool HasLshlAddU64Inst = false;
bool HasPointSampleAccel = false;
bool HasLdsBarrierArriveAtomic = false;
bool HasSetPrioIncWgInst = false;

bool RequiresCOV6 = false;
bool UseBlockVGPROpsForCSR = false;

// Dummy feature to use for assembler in tablegen.
bool FeatureDisable = false;

private:
// Lowering objects owned by value; the get*() accessors hand out pointers to
// them.
SIInstrInfo InstrInfo;
SITargetLowering TLInfo;
SIFrameLowering FrameLowering;

public:
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
             const GCNTargetMachine &TM);
~GCNSubtarget() override;

GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS);

/// Diagnose inconsistent subtarget features before attempting to codegen
/// function \p F.
293 void checkSubtargetFeatures(const Function &F) const; 294 getInstrInfo()295 const SIInstrInfo *getInstrInfo() const override { 296 return &InstrInfo; 297 } 298 getFrameLowering()299 const SIFrameLowering *getFrameLowering() const override { 300 return &FrameLowering; 301 } 302 getTargetLowering()303 const SITargetLowering *getTargetLowering() const override { 304 return &TLInfo; 305 } 306 getRegisterInfo()307 const SIRegisterInfo *getRegisterInfo() const override { 308 return &InstrInfo.getRegisterInfo(); 309 } 310 311 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override; 312 getCallLowering()313 const CallLowering *getCallLowering() const override { 314 return CallLoweringInfo.get(); 315 } 316 getInlineAsmLowering()317 const InlineAsmLowering *getInlineAsmLowering() const override { 318 return InlineAsmLoweringInfo.get(); 319 } 320 getInstructionSelector()321 InstructionSelector *getInstructionSelector() const override { 322 return InstSelector.get(); 323 } 324 getLegalizerInfo()325 const LegalizerInfo *getLegalizerInfo() const override { 326 return Legalizer.get(); 327 } 328 getRegBankInfo()329 const AMDGPURegisterBankInfo *getRegBankInfo() const override { 330 return RegBankInfo.get(); 331 } 332 getTargetID()333 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const { 334 return TargetID; 335 } 336 getInstrItineraryData()337 const InstrItineraryData *getInstrItineraryData() const override { 338 return &InstrItins; 339 } 340 341 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); 342 getGeneration()343 Generation getGeneration() const { 344 return (Generation)Gen; 345 } 346 getMaxWaveScratchSize()347 unsigned getMaxWaveScratchSize() const { 348 // See COMPUTE_TMPRING_SIZE.WAVESIZE. 349 if (getGeneration() >= GFX12) { 350 // 18-bit field in units of 64-dword. 351 return (64 * 4) * ((1 << 18) - 1); 352 } 353 if (getGeneration() == GFX11) { 354 // 15-bit field in units of 64-dword. 
355 return (64 * 4) * ((1 << 15) - 1); 356 } 357 // 13-bit field in units of 256-dword. 358 return (256 * 4) * ((1 << 13) - 1); 359 } 360 361 /// Return the number of high bits known to be zero for a frame index. getKnownHighZeroBitsForFrameIndex()362 unsigned getKnownHighZeroBitsForFrameIndex() const { 363 return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2(); 364 } 365 getLDSBankCount()366 int getLDSBankCount() const { 367 return LDSBankCount; 368 } 369 370 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { 371 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; 372 } 373 374 unsigned getConstantBusLimit(unsigned Opcode) const; 375 376 /// Returns if the result of this instruction with a 16-bit result returned in 377 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve 378 /// the original value. 379 bool zeroesHigh16BitsOfDest(unsigned Opcode) const; 380 supportsWGP()381 bool supportsWGP() const { return getGeneration() >= GFX10; } 382 hasIntClamp()383 bool hasIntClamp() const { 384 return HasIntClamp; 385 } 386 hasFP64()387 bool hasFP64() const { 388 return FP64; 389 } 390 hasMIMG_R128()391 bool hasMIMG_R128() const { 392 return MIMG_R128; 393 } 394 hasHWFP64()395 bool hasHWFP64() const { 396 return FP64; 397 } 398 hasHalfRate64Ops()399 bool hasHalfRate64Ops() const { 400 return HalfRate64Ops; 401 } 402 hasFullRate64Ops()403 bool hasFullRate64Ops() const { 404 return FullRate64Ops; 405 } 406 hasAddr64()407 bool hasAddr64() const { 408 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 409 } 410 hasFlat()411 bool hasFlat() const { 412 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS); 413 } 414 415 // Return true if the target only has the reverse operand versions of VALU 416 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). 
hasOnlyRevVALUShifts()417 bool hasOnlyRevVALUShifts() const { 418 return getGeneration() >= VOLCANIC_ISLANDS; 419 } 420 hasFractBug()421 bool hasFractBug() const { 422 return getGeneration() == SOUTHERN_ISLANDS; 423 } 424 hasBFE()425 bool hasBFE() const { 426 return true; 427 } 428 hasBFI()429 bool hasBFI() const { 430 return true; 431 } 432 hasBFM()433 bool hasBFM() const { 434 return hasBFE(); 435 } 436 hasBCNT(unsigned Size)437 bool hasBCNT(unsigned Size) const { 438 return true; 439 } 440 hasFFBL()441 bool hasFFBL() const { 442 return true; 443 } 444 hasFFBH()445 bool hasFFBH() const { 446 return true; 447 } 448 hasMed3_16()449 bool hasMed3_16() const { 450 return getGeneration() >= AMDGPUSubtarget::GFX9; 451 } 452 hasMin3Max3_16()453 bool hasMin3Max3_16() const { 454 return getGeneration() >= AMDGPUSubtarget::GFX9; 455 } 456 hasFmaMixInsts()457 bool hasFmaMixInsts() const { 458 return HasFmaMixInsts; 459 } 460 hasCARRY()461 bool hasCARRY() const { 462 return true; 463 } 464 hasFMA()465 bool hasFMA() const { 466 return FMA; 467 } 468 hasSwap()469 bool hasSwap() const { 470 return GFX9Insts; 471 } 472 hasScalarPackInsts()473 bool hasScalarPackInsts() const { 474 return GFX9Insts; 475 } 476 hasScalarMulHiInsts()477 bool hasScalarMulHiInsts() const { 478 return GFX9Insts; 479 } 480 hasScalarSubwordLoads()481 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; } 482 getTrapHandlerAbi()483 TrapHandlerAbi getTrapHandlerAbi() const { 484 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; 485 } 486 supportsGetDoorbellID()487 bool supportsGetDoorbellID() const { 488 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. 489 return getGeneration() >= GFX9; 490 } 491 492 /// True if the offset field of DS instructions works as expected. On SI, the 493 /// offset uses a 16-bit adder and does not always wrap properly. 
hasUsableDSOffset()494 bool hasUsableDSOffset() const { 495 return getGeneration() >= SEA_ISLANDS; 496 } 497 unsafeDSOffsetFoldingEnabled()498 bool unsafeDSOffsetFoldingEnabled() const { 499 return EnableUnsafeDSOffsetFolding; 500 } 501 502 /// Condition output from div_scale is usable. hasUsableDivScaleConditionOutput()503 bool hasUsableDivScaleConditionOutput() const { 504 return getGeneration() != SOUTHERN_ISLANDS; 505 } 506 507 /// Extra wait hazard is needed in some cases before 508 /// s_cbranch_vccnz/s_cbranch_vccz. hasReadVCCZBug()509 bool hasReadVCCZBug() const { 510 return getGeneration() <= SEA_ISLANDS; 511 } 512 513 /// Writes to VCC_LO/VCC_HI update the VCCZ flag. partialVCCWritesUpdateVCCZ()514 bool partialVCCWritesUpdateVCCZ() const { 515 return getGeneration() >= GFX10; 516 } 517 518 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 519 /// was written by a VALU instruction. hasSMRDReadVALUDefHazard()520 bool hasSMRDReadVALUDefHazard() const { 521 return getGeneration() == SOUTHERN_ISLANDS; 522 } 523 524 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 525 /// SGPR was written by a VALU Instruction. hasVMEMReadSGPRVALUDefHazard()526 bool hasVMEMReadSGPRVALUDefHazard() const { 527 return getGeneration() >= VOLCANIC_ISLANDS; 528 } 529 hasRFEHazards()530 bool hasRFEHazards() const { 531 return getGeneration() >= VOLCANIC_ISLANDS; 532 } 533 534 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. getSetRegWaitStates()535 unsigned getSetRegWaitStates() const { 536 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 537 } 538 dumpCode()539 bool dumpCode() const { 540 return DumpCode; 541 } 542 543 /// Return the amount of LDS that can be used that will not restrict the 544 /// occupancy lower than WaveCount. 
545 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 546 const Function &) const; 547 supportsMinMaxDenormModes()548 bool supportsMinMaxDenormModes() const { 549 return getGeneration() >= AMDGPUSubtarget::GFX9; 550 } 551 552 /// \returns If target supports S_DENORM_MODE. hasDenormModeInst()553 bool hasDenormModeInst() const { 554 return getGeneration() >= AMDGPUSubtarget::GFX10; 555 } 556 useFlatForGlobal()557 bool useFlatForGlobal() const { 558 return FlatForGlobal; 559 } 560 561 /// \returns If target supports ds_read/write_b128 and user enables generation 562 /// of ds_read/write_b128. useDS128()563 bool useDS128() const { 564 return CIInsts && EnableDS128; 565 } 566 567 /// \return If target supports ds_read/write_b96/128. hasDS96AndDS128()568 bool hasDS96AndDS128() const { 569 return CIInsts; 570 } 571 572 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 haveRoundOpsF64()573 bool haveRoundOpsF64() const { 574 return CIInsts; 575 } 576 577 /// \returns If MUBUF instructions always perform range checking, even for 578 /// buffer resources used for private memory access. privateMemoryResourceIsRangeChecked()579 bool privateMemoryResourceIsRangeChecked() const { 580 return getGeneration() < AMDGPUSubtarget::GFX9; 581 } 582 583 /// \returns If target requires PRT Struct NULL support (zero result registers 584 /// for sparse texture support). usePRTStrictNull()585 bool usePRTStrictNull() const { 586 return EnablePRTStrictNull; 587 } 588 hasAutoWaitcntBeforeBarrier()589 bool hasAutoWaitcntBeforeBarrier() const { 590 return AutoWaitcntBeforeBarrier; 591 } 592 593 /// \returns true if the target supports backing off of s_barrier instructions 594 /// when an exception is raised. 
supportsBackOffBarrier()595 bool supportsBackOffBarrier() const { 596 return BackOffBarrier; 597 } 598 hasUnalignedBufferAccess()599 bool hasUnalignedBufferAccess() const { 600 return UnalignedBufferAccess; 601 } 602 hasUnalignedBufferAccessEnabled()603 bool hasUnalignedBufferAccessEnabled() const { 604 return UnalignedBufferAccess && UnalignedAccessMode; 605 } 606 hasUnalignedDSAccess()607 bool hasUnalignedDSAccess() const { 608 return UnalignedDSAccess; 609 } 610 hasUnalignedDSAccessEnabled()611 bool hasUnalignedDSAccessEnabled() const { 612 return UnalignedDSAccess && UnalignedAccessMode; 613 } 614 hasUnalignedScratchAccess()615 bool hasUnalignedScratchAccess() const { 616 return UnalignedScratchAccess; 617 } 618 hasUnalignedScratchAccessEnabled()619 bool hasUnalignedScratchAccessEnabled() const { 620 return UnalignedScratchAccess && UnalignedAccessMode; 621 } 622 hasUnalignedAccessMode()623 bool hasUnalignedAccessMode() const { 624 return UnalignedAccessMode; 625 } 626 hasRelaxedBufferOOBMode()627 bool hasRelaxedBufferOOBMode() const { return RelaxedBufferOOBMode; } 628 hasApertureRegs()629 bool hasApertureRegs() const { 630 return HasApertureRegs; 631 } 632 isTrapHandlerEnabled()633 bool isTrapHandlerEnabled() const { 634 return TrapHandler; 635 } 636 isXNACKEnabled()637 bool isXNACKEnabled() const { 638 return TargetID.isXnackOnOrAny(); 639 } 640 isTgSplitEnabled()641 bool isTgSplitEnabled() const { 642 return EnableTgSplit; 643 } 644 isCuModeEnabled()645 bool isCuModeEnabled() const { 646 return EnableCuMode; 647 } 648 isPreciseMemoryEnabled()649 bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; } 650 hasFlatAddressSpace()651 bool hasFlatAddressSpace() const { 652 return FlatAddressSpace; 653 } 654 hasFlatScrRegister()655 bool hasFlatScrRegister() const { 656 return hasFlatAddressSpace(); 657 } 658 hasFlatInstOffsets()659 bool hasFlatInstOffsets() const { 660 return FlatInstOffsets; 661 } 662 hasFlatGlobalInsts()663 bool hasFlatGlobalInsts() 
const { 664 return FlatGlobalInsts; 665 } 666 hasFlatScratchInsts()667 bool hasFlatScratchInsts() const { 668 return FlatScratchInsts; 669 } 670 671 // Check if target supports ST addressing mode with FLAT scratch instructions. 672 // The ST addressing mode means no registers are used, either VGPR or SGPR, 673 // but only immediate offset is swizzled and added to the FLAT scratch base. hasFlatScratchSTMode()674 bool hasFlatScratchSTMode() const { 675 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts()); 676 } 677 hasFlatScratchSVSMode()678 bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; } 679 hasScalarFlatScratchInsts()680 bool hasScalarFlatScratchInsts() const { 681 return ScalarFlatScratchInsts; 682 } 683 enableFlatScratch()684 bool enableFlatScratch() const { 685 return flatScratchIsArchitected() || 686 (EnableFlatScratch && hasFlatScratchInsts()); 687 } 688 hasGlobalAddTidInsts()689 bool hasGlobalAddTidInsts() const { 690 return GFX10_BEncoding; 691 } 692 hasAtomicCSub()693 bool hasAtomicCSub() const { 694 return GFX10_BEncoding; 695 } 696 hasMTBUFInsts()697 bool hasMTBUFInsts() const { return !hasGFX1250Insts(); } 698 hasFormattedMUBUFInsts()699 bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); } 700 hasExportInsts()701 bool hasExportInsts() const { 702 return !hasGFX940Insts() && !hasGFX1250Insts(); 703 } 704 hasVINTERPEncoding()705 bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); } 706 707 // DS_ADD_F64/DS_ADD_RTN_F64 hasLdsAtomicAddF64()708 bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); } 709 hasMultiDwordFlatScratchAddressing()710 bool hasMultiDwordFlatScratchAddressing() const { 711 return getGeneration() >= GFX9; 712 } 713 hasFlatSegmentOffsetBug()714 bool hasFlatSegmentOffsetBug() const { 715 return HasFlatSegmentOffsetBug; 716 } 717 hasFlatLgkmVMemCountInOrder()718 bool hasFlatLgkmVMemCountInOrder() const { 719 return getGeneration() > GFX9; 720 } 721 
hasD16LoadStore()722 bool hasD16LoadStore() const { 723 return getGeneration() >= GFX9; 724 } 725 d16PreservesUnusedBits()726 bool d16PreservesUnusedBits() const { 727 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny(); 728 } 729 hasD16Images()730 bool hasD16Images() const { 731 return getGeneration() >= VOLCANIC_ISLANDS; 732 } 733 734 /// Return if most LDS instructions have an m0 use that require m0 to be 735 /// initialized. ldsRequiresM0Init()736 bool ldsRequiresM0Init() const { 737 return getGeneration() < GFX9; 738 } 739 740 // True if the hardware rewinds and replays GWS operations if a wave is 741 // preempted. 742 // 743 // If this is false, a GWS operation requires testing if a nack set the 744 // MEM_VIOL bit, and repeating if so. hasGWSAutoReplay()745 bool hasGWSAutoReplay() const { 746 return getGeneration() >= GFX9; 747 } 748 749 /// \returns if target has ds_gws_sema_release_all instruction. hasGWSSemaReleaseAll()750 bool hasGWSSemaReleaseAll() const { 751 return CIInsts; 752 } 753 754 /// \returns true if the target has integer add/sub instructions that do not 755 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, 756 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier 757 /// for saturation. 
hasAddNoCarry()758 bool hasAddNoCarry() const { 759 return AddNoCarryInsts; 760 } 761 hasScalarAddSub64()762 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; } 763 hasScalarSMulU64()764 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; } 765 hasUnpackedD16VMem()766 bool hasUnpackedD16VMem() const { 767 return HasUnpackedD16VMem; 768 } 769 770 // Covers VS/PS/CS graphics shaders isMesaGfxShader(const Function & F)771 bool isMesaGfxShader(const Function &F) const { 772 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 773 } 774 hasMad64_32()775 bool hasMad64_32() const { 776 return getGeneration() >= SEA_ISLANDS; 777 } 778 hasSDWAOmod()779 bool hasSDWAOmod() const { 780 return HasSDWAOmod; 781 } 782 hasSDWAScalar()783 bool hasSDWAScalar() const { 784 return HasSDWAScalar; 785 } 786 hasSDWASdst()787 bool hasSDWASdst() const { 788 return HasSDWASdst; 789 } 790 hasSDWAMac()791 bool hasSDWAMac() const { 792 return HasSDWAMac; 793 } 794 hasSDWAOutModsVOPC()795 bool hasSDWAOutModsVOPC() const { 796 return HasSDWAOutModsVOPC; 797 } 798 hasDLInsts()799 bool hasDLInsts() const { 800 return HasDLInsts; 801 } 802 hasFmacF64Inst()803 bool hasFmacF64Inst() const { return HasFmacF64Inst; } 804 hasDot1Insts()805 bool hasDot1Insts() const { 806 return HasDot1Insts; 807 } 808 hasDot2Insts()809 bool hasDot2Insts() const { 810 return HasDot2Insts; 811 } 812 hasDot3Insts()813 bool hasDot3Insts() const { 814 return HasDot3Insts; 815 } 816 hasDot4Insts()817 bool hasDot4Insts() const { 818 return HasDot4Insts; 819 } 820 hasDot5Insts()821 bool hasDot5Insts() const { 822 return HasDot5Insts; 823 } 824 hasDot6Insts()825 bool hasDot6Insts() const { 826 return HasDot6Insts; 827 } 828 hasDot7Insts()829 bool hasDot7Insts() const { 830 return HasDot7Insts; 831 } 832 hasDot8Insts()833 bool hasDot8Insts() const { 834 return HasDot8Insts; 835 } 836 hasDot9Insts()837 bool hasDot9Insts() const { 838 return HasDot9Insts; 839 } 840 hasDot10Insts()841 bool 
hasDot10Insts() const { 842 return HasDot10Insts; 843 } 844 hasDot11Insts()845 bool hasDot11Insts() const { 846 return HasDot11Insts; 847 } 848 hasDot12Insts()849 bool hasDot12Insts() const { 850 return HasDot12Insts; 851 } 852 hasDot13Insts()853 bool hasDot13Insts() const { 854 return HasDot13Insts; 855 } 856 hasMAIInsts()857 bool hasMAIInsts() const { 858 return HasMAIInsts; 859 } 860 hasFP8Insts()861 bool hasFP8Insts() const { 862 return HasFP8Insts; 863 } 864 hasFP8ConversionInsts()865 bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; } 866 hasFP8E5M3Insts()867 bool hasFP8E5M3Insts() const { return HasFP8E5M3Insts; } 868 hasPkFmacF16Inst()869 bool hasPkFmacF16Inst() const { 870 return HasPkFmacF16Inst; 871 } 872 hasAtomicFMinFMaxF32GlobalInsts()873 bool hasAtomicFMinFMaxF32GlobalInsts() const { 874 return HasAtomicFMinFMaxF32GlobalInsts; 875 } 876 hasAtomicFMinFMaxF64GlobalInsts()877 bool hasAtomicFMinFMaxF64GlobalInsts() const { 878 return HasAtomicFMinFMaxF64GlobalInsts; 879 } 880 hasAtomicFMinFMaxF32FlatInsts()881 bool hasAtomicFMinFMaxF32FlatInsts() const { 882 return HasAtomicFMinFMaxF32FlatInsts; 883 } 884 hasAtomicFMinFMaxF64FlatInsts()885 bool hasAtomicFMinFMaxF64FlatInsts() const { 886 return HasAtomicFMinFMaxF64FlatInsts; 887 } 888 hasAtomicDsPkAdd16Insts()889 bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; } 890 hasAtomicFlatPkAdd16Insts()891 bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; } 892 hasAtomicFaddInsts()893 bool hasAtomicFaddInsts() const { 894 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts; 895 } 896 hasAtomicFaddRtnInsts()897 bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; } 898 hasAtomicFaddNoRtnInsts()899 bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } 900 hasAtomicBufferGlobalPkAddF16NoRtnInsts()901 bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const { 902 return HasAtomicBufferGlobalPkAddF16NoRtnInsts; 903 } 904 
hasAtomicBufferGlobalPkAddF16Insts()905 bool hasAtomicBufferGlobalPkAddF16Insts() const { 906 return HasAtomicBufferGlobalPkAddF16Insts; 907 } 908 hasAtomicGlobalPkAddBF16Inst()909 bool hasAtomicGlobalPkAddBF16Inst() const { 910 return HasAtomicGlobalPkAddBF16Inst; 911 } 912 hasAtomicBufferPkAddBF16Inst()913 bool hasAtomicBufferPkAddBF16Inst() const { 914 return HasAtomicBufferPkAddBF16Inst; 915 } 916 hasFlatAtomicFaddF32Inst()917 bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } 918 919 /// \return true if the target has flat, global, and buffer atomic fadd for 920 /// double. hasFlatBufferGlobalAtomicFaddF64Inst()921 bool hasFlatBufferGlobalAtomicFaddF64Inst() const { 922 return HasFlatBufferGlobalAtomicFaddF64Inst; 923 } 924 925 /// \return true if the target's flat, global, and buffer atomic fadd for 926 /// float supports denormal handling. hasMemoryAtomicFaddF32DenormalSupport()927 bool hasMemoryAtomicFaddF32DenormalSupport() const { 928 return HasMemoryAtomicFaddF32DenormalSupport; 929 } 930 931 /// \return true if atomic operations targeting fine-grained memory work 932 /// correctly at device scope, in allocations in host or peer PCIe device 933 /// memory. 
supportsAgentScopeFineGrainedRemoteMemoryAtomics()934 bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const { 935 return HasAgentScopeFineGrainedRemoteMemoryAtomics; 936 } 937 hasDefaultComponentZero()938 bool hasDefaultComponentZero() const { return HasDefaultComponentZero; } 939 hasDefaultComponentBroadcast()940 bool hasDefaultComponentBroadcast() const { 941 return HasDefaultComponentBroadcast; 942 } 943 hasNoSdstCMPX()944 bool hasNoSdstCMPX() const { 945 return HasNoSdstCMPX; 946 } 947 hasVscnt()948 bool hasVscnt() const { 949 return HasVscnt; 950 } 951 hasGetWaveIdInst()952 bool hasGetWaveIdInst() const { 953 return HasGetWaveIdInst; 954 } 955 hasSMemTimeInst()956 bool hasSMemTimeInst() const { 957 return HasSMemTimeInst; 958 } 959 hasShaderCyclesRegister()960 bool hasShaderCyclesRegister() const { 961 return HasShaderCyclesRegister; 962 } 963 hasShaderCyclesHiLoRegisters()964 bool hasShaderCyclesHiLoRegisters() const { 965 return HasShaderCyclesHiLoRegisters; 966 } 967 hasVOP3Literal()968 bool hasVOP3Literal() const { 969 return HasVOP3Literal; 970 } 971 hasNoDataDepHazard()972 bool hasNoDataDepHazard() const { 973 return HasNoDataDepHazard; 974 } 975 vmemWriteNeedsExpWaitcnt()976 bool vmemWriteNeedsExpWaitcnt() const { 977 return getGeneration() < SEA_ISLANDS; 978 } 979 hasInstPrefetch()980 bool hasInstPrefetch() const { 981 return getGeneration() == GFX10 || getGeneration() == GFX11; 982 } 983 hasPrefetch()984 bool hasPrefetch() const { return GFX12Insts; } 985 hasSafeSmemPrefetch()986 bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; } 987 988 // Has s_cmpk_* instructions. hasSCmpK()989 bool hasSCmpK() const { return getGeneration() < GFX12; } 990 991 // Scratch is allocated in 256 dword per wave blocks for the entire 992 // wavefront. When viewed from the perspective of an arbitrary workitem, this 993 // is 4-byte aligned. 994 // 995 // Only 4-byte alignment is really needed to access anything. 
Transformations 996 // on the pointer value itself may rely on the alignment / known low bits of 997 // the pointer. Set this to something above the minimum to avoid needing 998 // dynamic realignment in common cases. getStackAlignment()999 Align getStackAlignment() const { return Align(16); } 1000 enableMachineScheduler()1001 bool enableMachineScheduler() const override { 1002 return true; 1003 } 1004 1005 bool useAA() const override; 1006 enableSubRegLiveness()1007 bool enableSubRegLiveness() const override { 1008 return true; 1009 } 1010 setScalarizeGlobalBehavior(bool b)1011 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } getScalarizeGlobalBehavior()1012 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 1013 1014 // static wrappers 1015 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 1016 1017 // XXX - Why is this here if it isn't in the default pass set? enableEarlyIfConversion()1018 bool enableEarlyIfConversion() const override { 1019 return true; 1020 } 1021 1022 void overrideSchedPolicy(MachineSchedPolicy &Policy, 1023 unsigned NumRegionInstrs) const override; 1024 1025 void mirFileLoaded(MachineFunction &MF) const override; 1026 getMaxNumUserSGPRs()1027 unsigned getMaxNumUserSGPRs() const { 1028 return AMDGPU::getMaxNumUserSGPRs(*this); 1029 } 1030 hasSMemRealTime()1031 bool hasSMemRealTime() const { 1032 return HasSMemRealTime; 1033 } 1034 hasMovrel()1035 bool hasMovrel() const { 1036 return HasMovrel; 1037 } 1038 hasVGPRIndexMode()1039 bool hasVGPRIndexMode() const { 1040 return HasVGPRIndexMode; 1041 } 1042 1043 bool useVGPRIndexMode() const; 1044 hasScalarCompareEq64()1045 bool hasScalarCompareEq64() const { 1046 return getGeneration() >= VOLCANIC_ISLANDS; 1047 } 1048 hasScalarDwordx3Loads()1049 bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; } 1050 hasScalarStores()1051 bool hasScalarStores() const { 1052 return HasScalarStores; 1053 } 1054 hasScalarAtomics()1055 bool 
hasScalarAtomics() const { 1056 return HasScalarAtomics; 1057 } 1058 hasLDSFPAtomicAddF32()1059 bool hasLDSFPAtomicAddF32() const { return GFX8Insts; } hasLDSFPAtomicAddF64()1060 bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; } 1061 1062 /// \returns true if the subtarget has the v_permlanex16_b32 instruction. hasPermLaneX16()1063 bool hasPermLaneX16() const { return getGeneration() >= GFX10; } 1064 1065 /// \returns true if the subtarget has the v_permlane64_b32 instruction. hasPermLane64()1066 bool hasPermLane64() const { return getGeneration() >= GFX11; } 1067 hasDPP()1068 bool hasDPP() const { 1069 return HasDPP; 1070 } 1071 hasDPPBroadcasts()1072 bool hasDPPBroadcasts() const { 1073 return HasDPP && getGeneration() < GFX10; 1074 } 1075 hasDPPWavefrontShifts()1076 bool hasDPPWavefrontShifts() const { 1077 return HasDPP && getGeneration() < GFX10; 1078 } 1079 hasDPP8()1080 bool hasDPP8() const { 1081 return HasDPP8; 1082 } 1083 hasDPALU_DPP()1084 bool hasDPALU_DPP() const { 1085 return HasDPALU_DPP; 1086 } 1087 hasDPPSrc1SGPR()1088 bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; } 1089 hasPackedFP32Ops()1090 bool hasPackedFP32Ops() const { 1091 return HasPackedFP32Ops; 1092 } 1093 1094 // Has V_PK_MOV_B32 opcode hasPkMovB32()1095 bool hasPkMovB32() const { 1096 return GFX90AInsts; 1097 } 1098 hasFmaakFmamkF32Insts()1099 bool hasFmaakFmamkF32Insts() const { 1100 return getGeneration() >= GFX10 || hasGFX940Insts(); 1101 } 1102 hasFmaakFmamkF64Insts()1103 bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); } 1104 hasImageInsts()1105 bool hasImageInsts() const { 1106 return HasImageInsts; 1107 } 1108 hasExtendedImageInsts()1109 bool hasExtendedImageInsts() const { 1110 return HasExtendedImageInsts; 1111 } 1112 hasR128A16()1113 bool hasR128A16() const { 1114 return HasR128A16; 1115 } 1116 hasA16()1117 bool hasA16() const { return HasA16; } 1118 hasG16()1119 bool hasG16() const { return HasG16; } 1120 hasOffset3fBug()1121 bool hasOffset3fBug() 
const { 1122 return HasOffset3fBug; 1123 } 1124 hasImageStoreD16Bug()1125 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } 1126 hasImageGather4D16Bug()1127 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } 1128 hasMADIntraFwdBug()1129 bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; } 1130 hasMSAALoadDstSelBug()1131 bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; } 1132 hasPrivEnabledTrap2NopBug()1133 bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; } 1134 hasNSAEncoding()1135 bool hasNSAEncoding() const { return HasNSAEncoding; } 1136 hasNonNSAEncoding()1137 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; } 1138 hasPartialNSAEncoding()1139 bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; } 1140 1141 unsigned getNSAMaxSize(bool HasSampler = false) const { 1142 return AMDGPU::getNSAMaxSize(*this, HasSampler); 1143 } 1144 hasGFX10_AEncoding()1145 bool hasGFX10_AEncoding() const { 1146 return GFX10_AEncoding; 1147 } 1148 hasGFX10_BEncoding()1149 bool hasGFX10_BEncoding() const { 1150 return GFX10_BEncoding; 1151 } 1152 hasGFX10_3Insts()1153 bool hasGFX10_3Insts() const { 1154 return GFX10_3Insts; 1155 } 1156 1157 bool hasMadF16() const; 1158 hasMovB64()1159 bool hasMovB64() const { return GFX940Insts || GFX1250Insts; } 1160 hasLshlAddU64Inst()1161 bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; } 1162 enableSIScheduler()1163 bool enableSIScheduler() const { 1164 return EnableSIScheduler; 1165 } 1166 loadStoreOptEnabled()1167 bool loadStoreOptEnabled() const { 1168 return EnableLoadStoreOpt; 1169 } 1170 hasSGPRInitBug()1171 bool hasSGPRInitBug() const { 1172 return SGPRInitBug; 1173 } 1174 hasUserSGPRInit16Bug()1175 bool hasUserSGPRInit16Bug() const { 1176 return UserSGPRInit16Bug && isWave32(); 1177 } 1178 hasNegativeScratchOffsetBug()1179 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } 1180 
hasNegativeUnalignedScratchOffsetBug()1181 bool hasNegativeUnalignedScratchOffsetBug() const { 1182 return NegativeUnalignedScratchOffsetBug; 1183 } 1184 hasMFMAInlineLiteralBug()1185 bool hasMFMAInlineLiteralBug() const { 1186 return HasMFMAInlineLiteralBug; 1187 } 1188 has12DWordStoreHazard()1189 bool has12DWordStoreHazard() const { 1190 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 1191 } 1192 1193 // \returns true if the subtarget supports DWORDX3 load/store instructions. hasDwordx3LoadStores()1194 bool hasDwordx3LoadStores() const { 1195 return CIInsts; 1196 } 1197 hasReadM0MovRelInterpHazard()1198 bool hasReadM0MovRelInterpHazard() const { 1199 return getGeneration() == AMDGPUSubtarget::GFX9; 1200 } 1201 hasReadM0SendMsgHazard()1202 bool hasReadM0SendMsgHazard() const { 1203 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 1204 getGeneration() <= AMDGPUSubtarget::GFX9; 1205 } 1206 hasReadM0LdsDmaHazard()1207 bool hasReadM0LdsDmaHazard() const { 1208 return getGeneration() == AMDGPUSubtarget::GFX9; 1209 } 1210 hasReadM0LdsDirectHazard()1211 bool hasReadM0LdsDirectHazard() const { 1212 return getGeneration() == AMDGPUSubtarget::GFX9; 1213 } 1214 hasVcmpxPermlaneHazard()1215 bool hasVcmpxPermlaneHazard() const { 1216 return HasVcmpxPermlaneHazard; 1217 } 1218 hasVMEMtoScalarWriteHazard()1219 bool hasVMEMtoScalarWriteHazard() const { 1220 return HasVMEMtoScalarWriteHazard; 1221 } 1222 hasSMEMtoVectorWriteHazard()1223 bool hasSMEMtoVectorWriteHazard() const { 1224 return HasSMEMtoVectorWriteHazard; 1225 } 1226 hasLDSMisalignedBug()1227 bool hasLDSMisalignedBug() const { 1228 return LDSMisalignedBug && !EnableCuMode; 1229 } 1230 hasInstFwdPrefetchBug()1231 bool hasInstFwdPrefetchBug() const { 1232 return HasInstFwdPrefetchBug; 1233 } 1234 hasVcmpxExecWARHazard()1235 bool hasVcmpxExecWARHazard() const { 1236 return HasVcmpxExecWARHazard; 1237 } 1238 hasLdsBranchVmemWARHazard()1239 bool hasLdsBranchVmemWARHazard() const { 1240 return 
HasLdsBranchVmemWARHazard; 1241 } 1242 1243 // Shift amount of a 64 bit shift cannot be a highest allocated register 1244 // if also at the end of the allocation block. hasShift64HighRegBug()1245 bool hasShift64HighRegBug() const { 1246 return GFX90AInsts && !GFX940Insts; 1247 } 1248 1249 // Has one cycle hazard on transcendental instruction feeding a 1250 // non transcendental VALU. hasTransForwardingHazard()1251 bool hasTransForwardingHazard() const { return GFX940Insts; } 1252 1253 // Has one cycle hazard on a VALU instruction partially writing dst with 1254 // a shift of result bits feeding another VALU instruction. hasDstSelForwardingHazard()1255 bool hasDstSelForwardingHazard() const { return GFX940Insts; } 1256 1257 // Cannot use op_sel with v_dot instructions. hasDOTOpSelHazard()1258 bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; } 1259 1260 // Does not have HW interlocs for VALU writing and then reading SGPRs. hasVDecCoExecHazard()1261 bool hasVDecCoExecHazard() const { 1262 return GFX940Insts; 1263 } 1264 hasNSAtoVMEMBug()1265 bool hasNSAtoVMEMBug() const { 1266 return HasNSAtoVMEMBug; 1267 } 1268 hasNSAClauseBug()1269 bool hasNSAClauseBug() const { return HasNSAClauseBug; } 1270 hasHardClauses()1271 bool hasHardClauses() const { return MaxHardClauseLength > 0; } 1272 hasGFX90AInsts()1273 bool hasGFX90AInsts() const { return GFX90AInsts; } 1274 hasFPAtomicToDenormModeHazard()1275 bool hasFPAtomicToDenormModeHazard() const { 1276 return getGeneration() == GFX10; 1277 } 1278 hasVOP3DPP()1279 bool hasVOP3DPP() const { return getGeneration() >= GFX11; } 1280 hasLdsDirect()1281 bool hasLdsDirect() const { return getGeneration() >= GFX11; } 1282 hasLdsWaitVMSRC()1283 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; } 1284 hasVALUPartialForwardingHazard()1285 bool hasVALUPartialForwardingHazard() const { 1286 return getGeneration() == GFX11; 1287 } 1288 hasVALUTransUseHazard()1289 bool hasVALUTransUseHazard() const { return 
HasVALUTransUseHazard; } 1290 hasCvtScaleForwardingHazard()1291 bool hasCvtScaleForwardingHazard() const { return GFX950Insts; } 1292 requiresCodeObjectV6()1293 bool requiresCodeObjectV6() const { return RequiresCOV6; } 1294 useVGPRBlockOpsForCSR()1295 bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; } 1296 hasVALUMaskWriteHazard()1297 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } 1298 hasVALUReadSGPRHazard()1299 bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; } 1300 1301 /// Return if operations acting on VGPR tuples require even alignment. needsAlignedVGPRs()1302 bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; } 1303 1304 /// Return true if the target has the S_PACK_HL_B32_B16 instruction. hasSPackHL()1305 bool hasSPackHL() const { return GFX11Insts; } 1306 1307 /// Return true if the target's EXP instruction has the COMPR flag, which 1308 /// affects the meaning of the EN (enable) bits. hasCompressedExport()1309 bool hasCompressedExport() const { return !GFX11Insts; } 1310 1311 /// Return true if the target's EXP instruction supports the NULL export 1312 /// target. hasNullExportTarget()1313 bool hasNullExportTarget() const { return !GFX11Insts; } 1314 has1_5xVGPRs()1315 bool has1_5xVGPRs() const { return Has1_5xVGPRs; } 1316 hasVOPDInsts()1317 bool hasVOPDInsts() const { return HasVOPDInsts; } 1318 hasFlatScratchSVSSwizzleBug()1319 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } 1320 1321 /// Return true if the target has the S_DELAY_ALU instruction. hasDelayAlu()1322 bool hasDelayAlu() const { return GFX11Insts; } 1323 hasPackedTID()1324 bool hasPackedTID() const { return HasPackedTID; } 1325 1326 // GFX94* is a derivation to GFX90A. hasGFX940Insts() being true implies that 1327 // hasGFX90AInsts is also true. hasGFX940Insts()1328 bool hasGFX940Insts() const { return GFX940Insts; } 1329 1330 // GFX950 is a derivation to GFX94*. 
hasGFX950Insts() implies that 1331 // hasGFX940Insts and hasGFX90AInsts are also true. hasGFX950Insts()1332 bool hasGFX950Insts() const { return GFX950Insts; } 1333 1334 /// Returns true if the target supports 1335 /// global_load_lds_dwordx3/global_load_lds_dwordx4 or 1336 /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit. hasLDSLoadB96_B128()1337 bool hasLDSLoadB96_B128() const { 1338 return hasGFX950Insts(); 1339 } 1340 hasVMemToLDSLoad()1341 bool hasVMemToLDSLoad() const { return HasVMemToLDSLoad; } 1342 hasSALUFloatInsts()1343 bool hasSALUFloatInsts() const { return HasSALUFloatInsts; } 1344 hasPseudoScalarTrans()1345 bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; } 1346 hasRestrictedSOffset()1347 bool hasRestrictedSOffset() const { return HasRestrictedSOffset; } 1348 hasRequiredExportPriority()1349 bool hasRequiredExportPriority() const { return HasRequiredExportPriority; } 1350 hasVmemWriteVgprInOrder()1351 bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; } 1352 1353 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt 1354 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively. hasExtendedWaitCounts()1355 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; } 1356 1357 /// \returns true if inline constants are not supported for F16 pseudo 1358 /// scalar transcendentals. hasNoF16PseudoScalarTransInlineConstants()1359 bool hasNoF16PseudoScalarTransInlineConstants() const { 1360 return getGeneration() == GFX12; 1361 } 1362 1363 /// \returns true if the target has instructions with xf32 format support. 
hasXF32Insts()1364 bool hasXF32Insts() const { return HasXF32Insts; } 1365 hasBitOp3Insts()1366 bool hasBitOp3Insts() const { return HasBitOp3Insts; } 1367 hasPermlane16Swap()1368 bool hasPermlane16Swap() const { return HasPermlane16Swap; } hasPermlane32Swap()1369 bool hasPermlane32Swap() const { return HasPermlane32Swap; } hasAshrPkInsts()1370 bool hasAshrPkInsts() const { return HasAshrPkInsts; } 1371 hasMinimum3Maximum3F32()1372 bool hasMinimum3Maximum3F32() const { 1373 return HasMinimum3Maximum3F32; 1374 } 1375 hasMinimum3Maximum3F16()1376 bool hasMinimum3Maximum3F16() const { 1377 return HasMinimum3Maximum3F16; 1378 } 1379 hasMinimum3Maximum3PKF16()1380 bool hasMinimum3Maximum3PKF16() const { 1381 return HasMinimum3Maximum3PKF16; 1382 } 1383 hasTransposeLoadF4F6Insts()1384 bool hasTransposeLoadF4F6Insts() const { return HasTransposeLoadF4F6Insts; } 1385 1386 /// \returns true if the target has s_wait_xcnt insertion. Supported for 1387 /// GFX1250. hasWaitXCnt()1388 bool hasWaitXCnt() const { return HasWaitXcnt; } 1389 1390 // A single DWORD instructions can use a 64-bit literal. has64BitLiterals()1391 bool has64BitLiterals() const { return Has64BitLiterals; } 1392 hasPointSampleAccel()1393 bool hasPointSampleAccel() const { return HasPointSampleAccel; } 1394 hasLdsBarrierArriveAtomic()1395 bool hasLdsBarrierArriveAtomic() const { return HasLdsBarrierArriveAtomic; } 1396 1397 /// \returns The maximum number of instructions that can be enclosed in an 1398 /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that 1399 /// instruction. 
maxHardClauseLength()1400 unsigned maxHardClauseLength() const { return MaxHardClauseLength; } 1401 hasPrngInst()1402 bool hasPrngInst() const { return HasPrngInst; } 1403 hasBVHDualAndBVH8Insts()1404 bool hasBVHDualAndBVH8Insts() const { return HasBVHDualAndBVH8Insts; } 1405 1406 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 1407 /// SGPRs 1408 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 1409 1410 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 1411 /// VGPRs 1412 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs, 1413 unsigned DynamicVGPRBlockSize) const; 1414 1415 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can 1416 /// be achieved when the only function running on a CU is \p F, each workgroup 1417 /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p 1418 /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a 1419 /// range, so this returns a range as well. 1420 /// 1421 /// Note that occupancy can be affected by the scratch allocation as well, but 1422 /// we do not have enough information to compute it. 1423 std::pair<unsigned, unsigned> computeOccupancy(const Function &F, 1424 unsigned LDSSize = 0, 1425 unsigned NumSGPRs = 0, 1426 unsigned NumVGPRs = 0) const; 1427 1428 /// \returns true if the flat_scratch register should be initialized with the 1429 /// pointer to the wave's scratch memory rather than a size and offset. flatScratchIsPointer()1430 bool flatScratchIsPointer() const { 1431 return getGeneration() >= AMDGPUSubtarget::GFX9; 1432 } 1433 1434 /// \returns true if the flat_scratch register is initialized by the HW. 1435 /// In this case it is readonly. flatScratchIsArchitected()1436 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } 1437 1438 /// \returns true if the architected SGPRs are enabled. 
hasArchitectedSGPRs()1439 bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; } 1440 1441 /// \returns true if Global Data Share is supported. hasGDS()1442 bool hasGDS() const { return HasGDS; } 1443 1444 /// \returns true if Global Wave Sync is supported. hasGWS()1445 bool hasGWS() const { return HasGWS; } 1446 1447 /// \returns true if the machine has merged shaders in which s0-s7 are 1448 /// reserved by the hardware and user SGPRs start at s8 hasMergedShaders()1449 bool hasMergedShaders() const { 1450 return getGeneration() >= GFX9; 1451 } 1452 1453 // \returns true if the target supports the pre-NGG legacy geometry path. hasLegacyGeometry()1454 bool hasLegacyGeometry() const { return getGeneration() < GFX11; } 1455 1456 // \returns true if preloading kernel arguments is supported. hasKernargPreload()1457 bool hasKernargPreload() const { return KernargPreload; } 1458 1459 // \returns true if the target has split barriers feature hasSplitBarriers()1460 bool hasSplitBarriers() const { return getGeneration() >= GFX12; } 1461 1462 // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable. hasCvtFP8VOP1Bug()1463 bool hasCvtFP8VOP1Bug() const { return HasCvtFP8Vop1Bug; } 1464 1465 // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a 1466 // no-return form. 
hasAtomicCSubNoRtnInsts()1467 bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; } 1468 1469 // \returns true if the target has DX10_CLAMP kernel descriptor mode bit hasDX10ClampMode()1470 bool hasDX10ClampMode() const { return getGeneration() < GFX12; } 1471 1472 // \returns true if the target has IEEE kernel descriptor mode bit hasIEEEMode()1473 bool hasIEEEMode() const { return getGeneration() < GFX12; } 1474 1475 // \returns true if the target has IEEE fminimum/fmaximum instructions hasIEEEMinimumMaximumInsts()1476 bool hasIEEEMinimumMaximumInsts() const { return HasIEEEMinimumMaximumInsts; } 1477 1478 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit hasRrWGMode()1479 bool hasRrWGMode() const { return getGeneration() >= GFX12; } 1480 1481 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative 1482 /// values. hasSignedScratchOffsets()1483 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; } 1484 hasGFX1250Insts()1485 bool hasGFX1250Insts() const { return GFX1250Insts; } 1486 hasVOPD3()1487 bool hasVOPD3() const { return GFX1250Insts; } 1488 1489 // \returns true if target has S_SETPRIO_INC_WG instruction. hasSetPrioIncWgInst()1490 bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; } 1491 1492 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead 1493 // of sign-extending. hasGetPCZeroExtension()1494 bool hasGetPCZeroExtension() const { return GFX12Insts; } 1495 1496 /// \returns SGPR allocation granularity supported by the subtarget. getSGPRAllocGranule()1497 unsigned getSGPRAllocGranule() const { 1498 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 1499 } 1500 1501 /// \returns SGPR encoding granularity supported by the subtarget. getSGPREncodingGranule()1502 unsigned getSGPREncodingGranule() const { 1503 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 1504 } 1505 1506 /// \returns Total number of SGPRs supported by the subtarget. 
getTotalNumSGPRs()1507 unsigned getTotalNumSGPRs() const { 1508 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 1509 } 1510 1511 /// \returns Addressable number of SGPRs supported by the subtarget. getAddressableNumSGPRs()1512 unsigned getAddressableNumSGPRs() const { 1513 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); 1514 } 1515 1516 /// \returns Minimum number of SGPRs that meets the given number of waves per 1517 /// execution unit requirement supported by the subtarget. getMinNumSGPRs(unsigned WavesPerEU)1518 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 1519 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); 1520 } 1521 1522 /// \returns Maximum number of SGPRs that meets the given number of waves per 1523 /// execution unit requirement supported by the subtarget. getMaxNumSGPRs(unsigned WavesPerEU,bool Addressable)1524 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 1525 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); 1526 } 1527 1528 /// \returns Reserved number of SGPRs. This is common 1529 /// utility function called by MachineFunction and 1530 /// Function variants of getReservedNumSGPRs. 1531 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const; 1532 /// \returns Reserved number of SGPRs for given machine function \p MF. 1533 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 1534 1535 /// \returns Reserved number of SGPRs for given function \p F. 1536 unsigned getReservedNumSGPRs(const Function &F) const; 1537 1538 /// \returns Maximum number of preloaded SGPRs for the subtarget. 1539 unsigned getMaxNumPreloadedSGPRs() const; 1540 1541 /// \returns max num SGPRs. This is the common utility 1542 /// function called by MachineFunction and Function 1543 /// variants of getMaxNumSGPRs. 
1544 unsigned getBaseMaxNumSGPRs(const Function &F, 1545 std::pair<unsigned, unsigned> WavesPerEU, 1546 unsigned PreloadedSGPRs, 1547 unsigned ReservedNumSGPRs) const; 1548 1549 /// \returns Maximum number of SGPRs that meets number of waves per execution 1550 /// unit requirement for function \p MF, or number of SGPRs explicitly 1551 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 1552 /// 1553 /// \returns Value that meets number of waves per execution unit requirement 1554 /// if explicitly requested value cannot be converted to integer, violates 1555 /// subtarget's specifications, or does not meet number of waves per execution 1556 /// unit requirement. 1557 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 1558 1559 /// \returns Maximum number of SGPRs that meets number of waves per execution 1560 /// unit requirement for function \p F, or number of SGPRs explicitly 1561 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F. 1562 /// 1563 /// \returns Value that meets number of waves per execution unit requirement 1564 /// if explicitly requested value cannot be converted to integer, violates 1565 /// subtarget's specifications, or does not meet number of waves per execution 1566 /// unit requirement. 1567 unsigned getMaxNumSGPRs(const Function &F) const; 1568 1569 /// \returns VGPR allocation granularity supported by the subtarget. getVGPRAllocGranule(unsigned DynamicVGPRBlockSize)1570 unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const { 1571 return AMDGPU::IsaInfo::getVGPRAllocGranule(this, DynamicVGPRBlockSize); 1572 } 1573 1574 /// \returns VGPR encoding granularity supported by the subtarget. getVGPREncodingGranule()1575 unsigned getVGPREncodingGranule() const { 1576 return AMDGPU::IsaInfo::getVGPREncodingGranule(this); 1577 } 1578 1579 /// \returns Total number of VGPRs supported by the subtarget. 
getTotalNumVGPRs()1580 unsigned getTotalNumVGPRs() const { 1581 return AMDGPU::IsaInfo::getTotalNumVGPRs(this); 1582 } 1583 1584 /// \returns Addressable number of architectural VGPRs supported by the 1585 /// subtarget. getAddressableNumArchVGPRs()1586 unsigned getAddressableNumArchVGPRs() const { 1587 return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this); 1588 } 1589 1590 /// \returns Addressable number of VGPRs supported by the subtarget. getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize)1591 unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const { 1592 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize); 1593 } 1594 1595 /// \returns the minimum number of VGPRs that will prevent achieving more than 1596 /// the specified number of waves \p WavesPerEU. getMinNumVGPRs(unsigned WavesPerEU,unsigned DynamicVGPRBlockSize)1597 unsigned getMinNumVGPRs(unsigned WavesPerEU, 1598 unsigned DynamicVGPRBlockSize) const { 1599 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU, 1600 DynamicVGPRBlockSize); 1601 } 1602 1603 /// \returns the maximum number of VGPRs that can be used and still achieved 1604 /// at least the specified number of waves \p WavesPerEU. getMaxNumVGPRs(unsigned WavesPerEU,unsigned DynamicVGPRBlockSize)1605 unsigned getMaxNumVGPRs(unsigned WavesPerEU, 1606 unsigned DynamicVGPRBlockSize) const { 1607 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU, 1608 DynamicVGPRBlockSize); 1609 } 1610 1611 /// \returns max num VGPRs. This is the common utility function 1612 /// called by MachineFunction and Function variants of getMaxNumVGPRs. 1613 unsigned 1614 getBaseMaxNumVGPRs(const Function &F, 1615 std::pair<unsigned, unsigned> NumVGPRBounds) const; 1616 1617 /// \returns Maximum number of VGPRs that meets number of waves per execution 1618 /// unit requirement for function \p F, or number of VGPRs explicitly 1619 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F. 
1620 /// 1621 /// \returns Value that meets number of waves per execution unit requirement 1622 /// if explicitly requested value cannot be converted to integer, violates 1623 /// subtarget's specifications, or does not meet number of waves per execution 1624 /// unit requirement. 1625 unsigned getMaxNumVGPRs(const Function &F) const; 1626 getMaxNumAGPRs(const Function & F)1627 unsigned getMaxNumAGPRs(const Function &F) const { 1628 return getMaxNumVGPRs(F); 1629 } 1630 1631 /// \returns Maximum number of VGPRs that meets number of waves per execution 1632 /// unit requirement for function \p MF, or number of VGPRs explicitly 1633 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 1634 /// 1635 /// \returns Value that meets number of waves per execution unit requirement 1636 /// if explicitly requested value cannot be converted to integer, violates 1637 /// subtarget's specifications, or does not meet number of waves per execution 1638 /// unit requirement. 1639 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 1640 isWave32()1641 bool isWave32() const { 1642 return getWavefrontSize() == 32; 1643 } 1644 isWave64()1645 bool isWave64() const { 1646 return getWavefrontSize() == 64; 1647 } 1648 1649 /// Returns if the wavesize of this subtarget is known reliable. This is false 1650 /// only for the a default target-cpu that does not have an explicit 1651 /// +wavefrontsize target feature. isWaveSizeKnown()1652 bool isWaveSizeKnown() const { 1653 return hasFeature(AMDGPU::FeatureWavefrontSize32) || 1654 hasFeature(AMDGPU::FeatureWavefrontSize64); 1655 } 1656 getBoolRC()1657 const TargetRegisterClass *getBoolRC() const { 1658 return getRegisterInfo()->getBoolRC(); 1659 } 1660 1661 /// \returns Maximum number of work groups per compute unit supported by the 1662 /// subtarget and limited by given \p FlatWorkGroupSize. 
getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize)1663 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1664 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1665 } 1666 1667 /// \returns Minimum flat work group size supported by the subtarget. getMinFlatWorkGroupSize()1668 unsigned getMinFlatWorkGroupSize() const override { 1669 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1670 } 1671 1672 /// \returns Maximum flat work group size supported by the subtarget. getMaxFlatWorkGroupSize()1673 unsigned getMaxFlatWorkGroupSize() const override { 1674 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1675 } 1676 1677 /// \returns Number of waves per execution unit required to support the given 1678 /// \p FlatWorkGroupSize. 1679 unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize)1680 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { 1681 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); 1682 } 1683 1684 /// \returns Minimum number of waves per execution unit supported by the 1685 /// subtarget. getMinWavesPerEU()1686 unsigned getMinWavesPerEU() const override { 1687 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1688 } 1689 1690 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, 1691 SDep &Dep, 1692 const TargetSchedModel *SchedModel) const override; 1693 1694 // \returns true if it's beneficial on this subtarget for the scheduler to 1695 // cluster stores as well as loads. shouldClusterStores()1696 bool shouldClusterStores() const { return getGeneration() >= GFX11; } 1697 1698 // \returns the number of address arguments from which to enable MIMG NSA 1699 // on supported architectures. 1700 unsigned getNSAThreshold(const MachineFunction &MF) const; 1701 1702 // \returns true if the subtarget has a hazard requiring an "s_nop 0" 1703 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)". 
requiresNopBeforeDeallocVGPRs()1704 bool requiresNopBeforeDeallocVGPRs() const { 1705 // Currently all targets that support the dealloc VGPRs message also require 1706 // the nop. 1707 return true; 1708 } 1709 isDynamicVGPREnabled()1710 bool isDynamicVGPREnabled() const { return DynamicVGPR; } getDynamicVGPRBlockSize()1711 unsigned getDynamicVGPRBlockSize() const { 1712 return DynamicVGPRBlockSize32 ? 32 : 16; 1713 } 1714 requiresDisjointEarlyClobberAndUndef()1715 bool requiresDisjointEarlyClobberAndUndef() const override { 1716 // AMDGPU doesn't care if early-clobber and undef operands are allocated 1717 // to the same register. 1718 return false; 1719 } 1720 }; 1721 1722 class GCNUserSGPRUsageInfo { 1723 public: hasImplicitBufferPtr()1724 bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; } 1725 hasPrivateSegmentBuffer()1726 bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; } 1727 hasDispatchPtr()1728 bool hasDispatchPtr() const { return DispatchPtr; } 1729 hasQueuePtr()1730 bool hasQueuePtr() const { return QueuePtr; } 1731 hasKernargSegmentPtr()1732 bool hasKernargSegmentPtr() const { return KernargSegmentPtr; } 1733 hasDispatchID()1734 bool hasDispatchID() const { return DispatchID; } 1735 hasFlatScratchInit()1736 bool hasFlatScratchInit() const { return FlatScratchInit; } 1737 hasPrivateSegmentSize()1738 bool hasPrivateSegmentSize() const { return PrivateSegmentSize; } 1739 getNumKernargPreloadSGPRs()1740 unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; } 1741 getNumUsedUserSGPRs()1742 unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; } 1743 1744 unsigned getNumFreeUserSGPRs(); 1745 1746 void allocKernargPreloadSGPRs(unsigned NumSGPRs); 1747 1748 enum UserSGPRID : unsigned { 1749 ImplicitBufferPtrID = 0, 1750 PrivateSegmentBufferID = 1, 1751 DispatchPtrID = 2, 1752 QueuePtrID = 3, 1753 KernargSegmentPtrID = 4, 1754 DispatchIdID = 5, 1755 FlatScratchInitID = 6, 1756 PrivateSegmentSizeID = 7 
1757 }; 1758 1759 // Returns the size in number of SGPRs for preload user SGPR field. getNumUserSGPRForField(UserSGPRID ID)1760 static unsigned getNumUserSGPRForField(UserSGPRID ID) { 1761 switch (ID) { 1762 case ImplicitBufferPtrID: 1763 return 2; 1764 case PrivateSegmentBufferID: 1765 return 4; 1766 case DispatchPtrID: 1767 return 2; 1768 case QueuePtrID: 1769 return 2; 1770 case KernargSegmentPtrID: 1771 return 2; 1772 case DispatchIdID: 1773 return 2; 1774 case FlatScratchInitID: 1775 return 2; 1776 case PrivateSegmentSizeID: 1777 return 1; 1778 } 1779 llvm_unreachable("Unknown UserSGPRID."); 1780 } 1781 1782 GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST); 1783 1784 private: 1785 const GCNSubtarget &ST; 1786 1787 // Private memory buffer 1788 // Compute directly in sgpr[0:1] 1789 // Other shaders indirect 64-bits at sgpr[0:1] 1790 bool ImplicitBufferPtr = false; 1791 1792 bool PrivateSegmentBuffer = false; 1793 1794 bool DispatchPtr = false; 1795 1796 bool QueuePtr = false; 1797 1798 bool KernargSegmentPtr = false; 1799 1800 bool DispatchID = false; 1801 1802 bool FlatScratchInit = false; 1803 1804 bool PrivateSegmentSize = false; 1805 1806 unsigned NumKernargPreloadSGPRs = 0; 1807 1808 unsigned NumUsedUserSGPRs = 0; 1809 }; 1810 1811 } // end namespace llvm 1812 1813 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 1814