1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// AMD GCN specific subclass of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 16 17 #include "AMDGPUCallLowering.h" 18 #include "AMDGPURegisterBankInfo.h" 19 #include "AMDGPUSubtarget.h" 20 #include "SIFrameLowering.h" 21 #include "SIISelLowering.h" 22 #include "SIInstrInfo.h" 23 #include "Utils/AMDGPUBaseInfo.h" 24 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 25 #include "llvm/Support/ErrorHandling.h" 26 27 #define GET_SUBTARGETINFO_HEADER 28 #include "AMDGPUGenSubtargetInfo.inc" 29 30 namespace llvm { 31 32 class GCNTargetMachine; 33 34 class GCNSubtarget final : public AMDGPUGenSubtargetInfo, 35 public AMDGPUSubtarget { 36 public: 37 using AMDGPUSubtarget::getMaxWavesPerEU; 38 39 // Following 2 enums are documented at: 40 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 41 enum class TrapHandlerAbi { 42 NONE = 0x00, 43 AMDHSA = 0x01, 44 }; 45 46 enum class TrapID { 47 LLVMAMDHSATrap = 0x02, 48 LLVMAMDHSADebugTrap = 0x03, 49 }; 50 51 private: 52 /// GlobalISel related APIs. 53 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 54 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo; 55 std::unique_ptr<InstructionSelector> InstSelector; 56 std::unique_ptr<LegalizerInfo> Legalizer; 57 std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo; 58 59 protected: 60 // Basic subtarget description. 
Triple TargetTriple;
// Exposed read-only through getTargetID(); also consulted for the dynamic
// XNACK / SRAMECC state (see isXNACKEnabled(), d16PreservesUnusedBits()).
AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
// Stored as a plain unsigned; getGeneration() converts it to Generation.
unsigned Gen = INVALID;
// Exposed through getInstrItineraryData().
InstrItineraryData InstrItins;
// Exposed through getLDSBankCount().
int LDSBankCount = 0;
// Returned by getMaxPrivateElementSize() when the query is for a buffer
// resource or flat scratch is not enabled.
unsigned MaxPrivateElementSize = 0;

// Possibly statically set by tablegen, but may want to be overridden.
bool FastDenormalF32 = false;
bool HalfRate64Ops = false;
bool FullRate64Ops = false;

// Dynamically set bits that enable features.
bool FlatForGlobal = false;
bool AutoWaitcntBeforeBarrier = false;
bool BackOffBarrier = false;
bool UnalignedScratchAccess = false;
// Gates the "...AccessEnabled()" queries for buffer and DS accesses.
bool UnalignedAccessMode = false;
bool HasApertureRegs = false;
bool SupportsXNACK = false;
bool KernargPreload = false;

// This should not be used directly. 'TargetID' tracks the dynamic settings
// for XNACK.
bool EnableXNACK = false;

bool EnableTgSplit = false;
bool EnableCuMode = false;
bool TrapHandler = false;

// Used as options.
bool EnableLoadStoreOpt = false;
bool EnableUnsafeDSOffsetFolding = false;
bool EnableSIScheduler = false;
bool EnableDS128 = false;
bool EnablePRTStrictNull = false;
bool DumpCode = false;

// Subtarget properties statically set by tablegen.
bool FP64 = false;
bool FMA = false;
bool MIMG_R128 = false;
bool CIInsts = false;
bool GFX8Insts = false;
bool GFX9Insts = false;
bool GFX90AInsts = false;
bool GFX940Insts = false;
bool GFX10Insts = false;
bool GFX11Insts = false;
bool GFX12Insts = false;
bool GFX10_3Insts = false;
bool GFX7GFX8GFX9Insts = false;
bool SGPRInitBug = false;
// Only reported by hasUserSGPRInit16Bug() when the subtarget is wave32.
bool UserSGPRInit16Bug = false;
bool NegativeScratchOffsetBug = false;
bool NegativeUnalignedScratchOffsetBug = false;
bool HasSMemRealTime = false;
bool HasIntClamp = false;
bool HasFmaMixInsts = false;
bool HasMovrel = false;
bool HasVGPRIndexMode = false;
bool HasScalarDwordx3Loads = false;
bool HasScalarStores = false;
bool HasScalarAtomics = false;
bool HasSDWAOmod = false;
bool HasSDWAScalar = false;
bool HasSDWASdst = false;
bool HasSDWAMac = false;
bool HasSDWAOutModsVOPC = false;
// Also feeds hasDPPBroadcasts() / hasDPPWavefrontShifts(), which additionally
// require a generation below GFX10.
bool HasDPP = false;
bool HasDPP8 = false;
bool HasDPALU_DPP = false;
bool HasDPPSrc1SGPR = false;
bool HasPackedFP32Ops = false;
bool HasImageInsts = false;
bool HasExtendedImageInsts = false;
bool HasR128A16 = false;
bool HasA16 = false;
bool HasG16 = false;
bool HasNSAEncoding = false;
bool HasPartialNSAEncoding = false;
bool GFX10_AEncoding = false;
// Also backs hasGlobalAddTidInsts() and hasAtomicCSub().
bool GFX10_BEncoding = false;
bool HasDLInsts = false;
bool HasFmacF64Inst = false;
bool HasDot1Insts = false;
bool HasDot2Insts = false;
bool HasDot3Insts = false;
bool HasDot4Insts = false;
bool HasDot5Insts = false;
bool HasDot6Insts = false;
bool HasDot7Insts = false;
bool HasDot8Insts = false;
bool HasDot9Insts = false;
bool HasDot10Insts = false;
bool HasMAIInsts = false;
bool HasFP8Insts = false;
bool HasFP8ConversionInsts = false;
bool HasPkFmacF16Inst = false;
bool HasAtomicDsPkAdd16Insts = false;
bool HasAtomicFlatPkAdd16Insts = false;
// hasAtomicFaddInsts() is the OR of the following two flags.
bool HasAtomicFaddRtnInsts = false;
bool HasAtomicFaddNoRtnInsts = false;
bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
bool HasAtomicBufferGlobalPkAddF16Insts = false;
bool HasAtomicCSubNoRtnInsts = false;
bool HasAtomicGlobalPkAddBF16Inst = false;
bool HasFlatAtomicFaddF32Inst = false;
bool HasDefaultComponentZero = false;
bool HasDefaultComponentBroadcast = false;
bool SupportsSRAMECC = false;

// This should not be used directly. 'TargetID' tracks the dynamic settings
// for SRAMECC.
bool EnableSRAMECC = false;

bool HasNoSdstCMPX = false;
bool HasVscnt = false;
bool HasGetWaveIdInst = false;
bool HasSMemTimeInst = false;
bool HasShaderCyclesRegister = false;
bool HasShaderCyclesHiLoRegisters = false;
bool HasVOP3Literal = false;
bool HasNoDataDepHazard = false;
// hasFlatScrRegister() reports the same value as hasFlatAddressSpace().
bool FlatAddressSpace = false;
bool FlatInstOffsets = false;
bool FlatGlobalInsts = false;
bool FlatScratchInsts = false;
bool ScalarFlatScratchInsts = false;
bool HasArchitectedFlatScratch = false;
// Only honoured by enableFlatScratch() together with hasFlatScratchInsts(),
// unless flat scratch is architected.
bool EnableFlatScratch = false;
bool HasArchitectedSGPRs = false;
bool HasGDS = false;
bool HasGWS = false;
bool AddNoCarryInsts = false;
bool HasUnpackedD16VMem = false;
// hasLDSMisalignedBug() masks this flag out when EnableCuMode is set.
bool LDSMisalignedBug = false;
bool HasMFMAInlineLiteralBug = false;
// The two flags below are reported as "enabled" only when UnalignedAccessMode
// is also set (see the has...AccessEnabled() queries).
bool UnalignedBufferAccess = false;
bool UnalignedDSAccess = false;
bool HasPackedTID = false;
// Mutable after construction through setScalarizeGlobalBehavior().
bool ScalarizeGlobal = false;
bool HasSALUFloatInsts = false;
bool HasVGPRSingleUseHintInsts = false;
bool HasPseudoScalarTrans = false;
bool HasRestrictedSOffset = false;

// NOTE(review): judging by their names, the flags below describe hardware
// hazards and bugs; their exact meaning is defined by the target description
// outside this file section - confirm before relying on it.
bool HasVcmpxPermlaneHazard = false;
bool HasVMEMtoScalarWriteHazard = false;
bool HasSMEMtoVectorWriteHazard = false;
bool HasInstFwdPrefetchBug = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
bool HasNSAClauseBug = false;
bool HasOffset3fBug = false;
bool HasFlatSegmentOffsetBug = false;
bool HasImageStoreD16Bug = false;
bool HasImageGather4D16Bug = false;
bool HasMSAALoadDstSelBug = false;
bool HasGFX11FullVGPRs = false;
bool HasMADIntraFwdBug = false;
bool HasVOPDInsts = false;
bool HasVALUTransUseHazard = false;
bool HasForceStoreSC0SC1 = false;

// Dummy feature to use for assembler in tablegen.
228 bool FeatureDisable = false; 229 230 SelectionDAGTargetInfo TSInfo; 231 private: 232 SIInstrInfo InstrInfo; 233 SITargetLowering TLInfo; 234 SIFrameLowering FrameLowering; 235 236 public: 237 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 238 const GCNTargetMachine &TM); 239 ~GCNSubtarget() override; 240 241 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, 242 StringRef GPU, StringRef FS); 243 244 const SIInstrInfo *getInstrInfo() const override { 245 return &InstrInfo; 246 } 247 248 const SIFrameLowering *getFrameLowering() const override { 249 return &FrameLowering; 250 } 251 252 const SITargetLowering *getTargetLowering() const override { 253 return &TLInfo; 254 } 255 256 const SIRegisterInfo *getRegisterInfo() const override { 257 return &InstrInfo.getRegisterInfo(); 258 } 259 260 const CallLowering *getCallLowering() const override { 261 return CallLoweringInfo.get(); 262 } 263 264 const InlineAsmLowering *getInlineAsmLowering() const override { 265 return InlineAsmLoweringInfo.get(); 266 } 267 268 InstructionSelector *getInstructionSelector() const override { 269 return InstSelector.get(); 270 } 271 272 const LegalizerInfo *getLegalizerInfo() const override { 273 return Legalizer.get(); 274 } 275 276 const AMDGPURegisterBankInfo *getRegBankInfo() const override { 277 return RegBankInfo.get(); 278 } 279 280 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const { 281 return TargetID; 282 } 283 284 // Nothing implemented, just prevent crashes on use. 285 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 286 return &TSInfo; 287 } 288 289 const InstrItineraryData *getInstrItineraryData() const override { 290 return &InstrItins; 291 } 292 293 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); 294 295 Generation getGeneration() const { 296 return (Generation)Gen; 297 } 298 299 unsigned getMaxWaveScratchSize() const { 300 // See COMPUTE_TMPRING_SIZE.WAVESIZE. 
301 if (getGeneration() >= GFX12) { 302 // 18-bit field in units of 64-dword. 303 return (64 * 4) * ((1 << 18) - 1); 304 } 305 if (getGeneration() == GFX11) { 306 // 15-bit field in units of 64-dword. 307 return (64 * 4) * ((1 << 15) - 1); 308 } 309 // 13-bit field in units of 256-dword. 310 return (256 * 4) * ((1 << 13) - 1); 311 } 312 313 /// Return the number of high bits known to be zero for a frame index. 314 unsigned getKnownHighZeroBitsForFrameIndex() const { 315 return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2(); 316 } 317 318 int getLDSBankCount() const { 319 return LDSBankCount; 320 } 321 322 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { 323 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; 324 } 325 326 unsigned getConstantBusLimit(unsigned Opcode) const; 327 328 /// Returns if the result of this instruction with a 16-bit result returned in 329 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve 330 /// the original value. 331 bool zeroesHigh16BitsOfDest(unsigned Opcode) const; 332 333 bool supportsWGP() const { return getGeneration() >= GFX10; } 334 335 bool hasIntClamp() const { 336 return HasIntClamp; 337 } 338 339 bool hasFP64() const { 340 return FP64; 341 } 342 343 bool hasMIMG_R128() const { 344 return MIMG_R128; 345 } 346 347 bool hasHWFP64() const { 348 return FP64; 349 } 350 351 bool hasHalfRate64Ops() const { 352 return HalfRate64Ops; 353 } 354 355 bool hasFullRate64Ops() const { 356 return FullRate64Ops; 357 } 358 359 bool hasAddr64() const { 360 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 361 } 362 363 bool hasFlat() const { 364 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS); 365 } 366 367 // Return true if the target only has the reverse operand versions of VALU 368 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). 
369 bool hasOnlyRevVALUShifts() const { 370 return getGeneration() >= VOLCANIC_ISLANDS; 371 } 372 373 bool hasFractBug() const { 374 return getGeneration() == SOUTHERN_ISLANDS; 375 } 376 377 bool hasBFE() const { 378 return true; 379 } 380 381 bool hasBFI() const { 382 return true; 383 } 384 385 bool hasBFM() const { 386 return hasBFE(); 387 } 388 389 bool hasBCNT(unsigned Size) const { 390 return true; 391 } 392 393 bool hasFFBL() const { 394 return true; 395 } 396 397 bool hasFFBH() const { 398 return true; 399 } 400 401 bool hasMed3_16() const { 402 return getGeneration() >= AMDGPUSubtarget::GFX9; 403 } 404 405 bool hasMin3Max3_16() const { 406 return getGeneration() >= AMDGPUSubtarget::GFX9; 407 } 408 409 bool hasFmaMixInsts() const { 410 return HasFmaMixInsts; 411 } 412 413 bool hasCARRY() const { 414 return true; 415 } 416 417 bool hasFMA() const { 418 return FMA; 419 } 420 421 bool hasSwap() const { 422 return GFX9Insts; 423 } 424 425 bool hasScalarPackInsts() const { 426 return GFX9Insts; 427 } 428 429 bool hasScalarMulHiInsts() const { 430 return GFX9Insts; 431 } 432 433 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; } 434 435 TrapHandlerAbi getTrapHandlerAbi() const { 436 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; 437 } 438 439 bool supportsGetDoorbellID() const { 440 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. 441 return getGeneration() >= GFX9; 442 } 443 444 /// True if the offset field of DS instructions works as expected. On SI, the 445 /// offset uses a 16-bit adder and does not always wrap properly. 446 bool hasUsableDSOffset() const { 447 return getGeneration() >= SEA_ISLANDS; 448 } 449 450 bool unsafeDSOffsetFoldingEnabled() const { 451 return EnableUnsafeDSOffsetFolding; 452 } 453 454 /// Condition output from div_scale is usable. 
455 bool hasUsableDivScaleConditionOutput() const { 456 return getGeneration() != SOUTHERN_ISLANDS; 457 } 458 459 /// Extra wait hazard is needed in some cases before 460 /// s_cbranch_vccnz/s_cbranch_vccz. 461 bool hasReadVCCZBug() const { 462 return getGeneration() <= SEA_ISLANDS; 463 } 464 465 /// Writes to VCC_LO/VCC_HI update the VCCZ flag. 466 bool partialVCCWritesUpdateVCCZ() const { 467 return getGeneration() >= GFX10; 468 } 469 470 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 471 /// was written by a VALU instruction. 472 bool hasSMRDReadVALUDefHazard() const { 473 return getGeneration() == SOUTHERN_ISLANDS; 474 } 475 476 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 477 /// SGPR was written by a VALU Instruction. 478 bool hasVMEMReadSGPRVALUDefHazard() const { 479 return getGeneration() >= VOLCANIC_ISLANDS; 480 } 481 482 bool hasRFEHazards() const { 483 return getGeneration() >= VOLCANIC_ISLANDS; 484 } 485 486 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. 487 unsigned getSetRegWaitStates() const { 488 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 489 } 490 491 bool dumpCode() const { 492 return DumpCode; 493 } 494 495 /// Return the amount of LDS that can be used that will not restrict the 496 /// occupancy lower than WaveCount. 497 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 498 const Function &) const; 499 500 bool supportsMinMaxDenormModes() const { 501 return getGeneration() >= AMDGPUSubtarget::GFX9; 502 } 503 504 /// \returns If target supports S_DENORM_MODE. 505 bool hasDenormModeInst() const { 506 return getGeneration() >= AMDGPUSubtarget::GFX10; 507 } 508 509 bool useFlatForGlobal() const { 510 return FlatForGlobal; 511 } 512 513 /// \returns If target supports ds_read/write_b128 and user enables generation 514 /// of ds_read/write_b128. 
515 bool useDS128() const { 516 return CIInsts && EnableDS128; 517 } 518 519 /// \return If target supports ds_read/write_b96/128. 520 bool hasDS96AndDS128() const { 521 return CIInsts; 522 } 523 524 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 525 bool haveRoundOpsF64() const { 526 return CIInsts; 527 } 528 529 /// \returns If MUBUF instructions always perform range checking, even for 530 /// buffer resources used for private memory access. 531 bool privateMemoryResourceIsRangeChecked() const { 532 return getGeneration() < AMDGPUSubtarget::GFX9; 533 } 534 535 /// \returns If target requires PRT Struct NULL support (zero result registers 536 /// for sparse texture support). 537 bool usePRTStrictNull() const { 538 return EnablePRTStrictNull; 539 } 540 541 bool hasAutoWaitcntBeforeBarrier() const { 542 return AutoWaitcntBeforeBarrier; 543 } 544 545 /// \returns true if the target supports backing off of s_barrier instructions 546 /// when an exception is raised. 547 bool supportsBackOffBarrier() const { 548 return BackOffBarrier; 549 } 550 551 bool hasUnalignedBufferAccess() const { 552 return UnalignedBufferAccess; 553 } 554 555 bool hasUnalignedBufferAccessEnabled() const { 556 return UnalignedBufferAccess && UnalignedAccessMode; 557 } 558 559 bool hasUnalignedDSAccess() const { 560 return UnalignedDSAccess; 561 } 562 563 bool hasUnalignedDSAccessEnabled() const { 564 return UnalignedDSAccess && UnalignedAccessMode; 565 } 566 567 bool hasUnalignedScratchAccess() const { 568 return UnalignedScratchAccess; 569 } 570 571 bool hasUnalignedAccessMode() const { 572 return UnalignedAccessMode; 573 } 574 575 bool hasApertureRegs() const { 576 return HasApertureRegs; 577 } 578 579 bool isTrapHandlerEnabled() const { 580 return TrapHandler; 581 } 582 583 bool isXNACKEnabled() const { 584 return TargetID.isXnackOnOrAny(); 585 } 586 587 bool isTgSplitEnabled() const { 588 return EnableTgSplit; 589 } 590 591 bool isCuModeEnabled() const { 592 return EnableCuMode; 593 } 594 595 
bool hasFlatAddressSpace() const { 596 return FlatAddressSpace; 597 } 598 599 bool hasFlatScrRegister() const { 600 return hasFlatAddressSpace(); 601 } 602 603 bool hasFlatInstOffsets() const { 604 return FlatInstOffsets; 605 } 606 607 bool hasFlatGlobalInsts() const { 608 return FlatGlobalInsts; 609 } 610 611 bool hasFlatScratchInsts() const { 612 return FlatScratchInsts; 613 } 614 615 // Check if target supports ST addressing mode with FLAT scratch instructions. 616 // The ST addressing mode means no registers are used, either VGPR or SGPR, 617 // but only immediate offset is swizzled and added to the FLAT scratch base. 618 bool hasFlatScratchSTMode() const { 619 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts()); 620 } 621 622 bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; } 623 624 bool hasScalarFlatScratchInsts() const { 625 return ScalarFlatScratchInsts; 626 } 627 628 bool enableFlatScratch() const { 629 return flatScratchIsArchitected() || 630 (EnableFlatScratch && hasFlatScratchInsts()); 631 } 632 633 bool hasGlobalAddTidInsts() const { 634 return GFX10_BEncoding; 635 } 636 637 bool hasAtomicCSub() const { 638 return GFX10_BEncoding; 639 } 640 641 bool hasMultiDwordFlatScratchAddressing() const { 642 return getGeneration() >= GFX9; 643 } 644 645 bool hasFlatSegmentOffsetBug() const { 646 return HasFlatSegmentOffsetBug; 647 } 648 649 bool hasFlatLgkmVMemCountInOrder() const { 650 return getGeneration() > GFX9; 651 } 652 653 bool hasD16LoadStore() const { 654 return getGeneration() >= GFX9; 655 } 656 657 bool d16PreservesUnusedBits() const { 658 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny(); 659 } 660 661 bool hasD16Images() const { 662 return getGeneration() >= VOLCANIC_ISLANDS; 663 } 664 665 /// Return if most LDS instructions have an m0 use that require m0 to be 666 /// initialized. 
667 bool ldsRequiresM0Init() const { 668 return getGeneration() < GFX9; 669 } 670 671 // True if the hardware rewinds and replays GWS operations if a wave is 672 // preempted. 673 // 674 // If this is false, a GWS operation requires testing if a nack set the 675 // MEM_VIOL bit, and repeating if so. 676 bool hasGWSAutoReplay() const { 677 return getGeneration() >= GFX9; 678 } 679 680 /// \returns if target has ds_gws_sema_release_all instruction. 681 bool hasGWSSemaReleaseAll() const { 682 return CIInsts; 683 } 684 685 /// \returns true if the target has integer add/sub instructions that do not 686 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, 687 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier 688 /// for saturation. 689 bool hasAddNoCarry() const { 690 return AddNoCarryInsts; 691 } 692 693 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; } 694 695 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; } 696 697 bool hasUnpackedD16VMem() const { 698 return HasUnpackedD16VMem; 699 } 700 701 // Covers VS/PS/CS graphics shaders 702 bool isMesaGfxShader(const Function &F) const { 703 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 704 } 705 706 bool hasMad64_32() const { 707 return getGeneration() >= SEA_ISLANDS; 708 } 709 710 bool hasSDWAOmod() const { 711 return HasSDWAOmod; 712 } 713 714 bool hasSDWAScalar() const { 715 return HasSDWAScalar; 716 } 717 718 bool hasSDWASdst() const { 719 return HasSDWASdst; 720 } 721 722 bool hasSDWAMac() const { 723 return HasSDWAMac; 724 } 725 726 bool hasSDWAOutModsVOPC() const { 727 return HasSDWAOutModsVOPC; 728 } 729 730 bool hasDLInsts() const { 731 return HasDLInsts; 732 } 733 734 bool hasFmacF64Inst() const { return HasFmacF64Inst; } 735 736 bool hasDot1Insts() const { 737 return HasDot1Insts; 738 } 739 740 bool hasDot2Insts() const { 741 return HasDot2Insts; 742 } 743 744 bool hasDot3Insts() const { 745 return HasDot3Insts; 746 } 747 
748 bool hasDot4Insts() const { 749 return HasDot4Insts; 750 } 751 752 bool hasDot5Insts() const { 753 return HasDot5Insts; 754 } 755 756 bool hasDot6Insts() const { 757 return HasDot6Insts; 758 } 759 760 bool hasDot7Insts() const { 761 return HasDot7Insts; 762 } 763 764 bool hasDot8Insts() const { 765 return HasDot8Insts; 766 } 767 768 bool hasDot9Insts() const { 769 return HasDot9Insts; 770 } 771 772 bool hasDot10Insts() const { 773 return HasDot10Insts; 774 } 775 776 bool hasMAIInsts() const { 777 return HasMAIInsts; 778 } 779 780 bool hasFP8Insts() const { 781 return HasFP8Insts; 782 } 783 784 bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; } 785 786 bool hasPkFmacF16Inst() const { 787 return HasPkFmacF16Inst; 788 } 789 790 bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; } 791 792 bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; } 793 794 bool hasAtomicFaddInsts() const { 795 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts; 796 } 797 798 bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; } 799 800 bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } 801 802 bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const { 803 return HasAtomicBufferGlobalPkAddF16NoRtnInsts; 804 } 805 806 bool hasAtomicBufferGlobalPkAddF16Insts() const { 807 return HasAtomicBufferGlobalPkAddF16Insts; 808 } 809 810 bool hasAtomicGlobalPkAddBF16Inst() const { 811 return HasAtomicGlobalPkAddBF16Inst; 812 } 813 814 bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } 815 816 bool hasDefaultComponentZero() const { return HasDefaultComponentZero; } 817 818 bool hasDefaultComponentBroadcast() const { 819 return HasDefaultComponentBroadcast; 820 } 821 822 bool hasNoSdstCMPX() const { 823 return HasNoSdstCMPX; 824 } 825 826 bool hasVscnt() const { 827 return HasVscnt; 828 } 829 830 bool hasGetWaveIdInst() const { 831 return HasGetWaveIdInst; 832 } 833 834 bool 
hasSMemTimeInst() const { 835 return HasSMemTimeInst; 836 } 837 838 bool hasShaderCyclesRegister() const { 839 return HasShaderCyclesRegister; 840 } 841 842 bool hasShaderCyclesHiLoRegisters() const { 843 return HasShaderCyclesHiLoRegisters; 844 } 845 846 bool hasVOP3Literal() const { 847 return HasVOP3Literal; 848 } 849 850 bool hasNoDataDepHazard() const { 851 return HasNoDataDepHazard; 852 } 853 854 bool vmemWriteNeedsExpWaitcnt() const { 855 return getGeneration() < SEA_ISLANDS; 856 } 857 858 bool hasInstPrefetch() const { 859 return getGeneration() == GFX10 || getGeneration() == GFX11; 860 } 861 862 bool hasPrefetch() const { return GFX12Insts; } 863 864 // Has s_cmpk_* instructions. 865 bool hasSCmpK() const { return getGeneration() < GFX12; } 866 867 // Scratch is allocated in 256 dword per wave blocks for the entire 868 // wavefront. When viewed from the perspective of an arbitrary workitem, this 869 // is 4-byte aligned. 870 // 871 // Only 4-byte alignment is really needed to access anything. Transformations 872 // on the pointer value itself may rely on the alignment / known low bits of 873 // the pointer. Set this to something above the minimum to avoid needing 874 // dynamic realignment in common cases. 875 Align getStackAlignment() const { return Align(16); } 876 877 bool enableMachineScheduler() const override { 878 return true; 879 } 880 881 bool useAA() const override; 882 883 bool enableSubRegLiveness() const override { 884 return true; 885 } 886 887 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } 888 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 889 890 // static wrappers 891 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 892 893 // XXX - Why is this here if it isn't in the default pass set? 
894 bool enableEarlyIfConversion() const override { 895 return true; 896 } 897 898 void overrideSchedPolicy(MachineSchedPolicy &Policy, 899 unsigned NumRegionInstrs) const override; 900 901 unsigned getMaxNumUserSGPRs() const { 902 return AMDGPU::getMaxNumUserSGPRs(*this); 903 } 904 905 bool hasSMemRealTime() const { 906 return HasSMemRealTime; 907 } 908 909 bool hasMovrel() const { 910 return HasMovrel; 911 } 912 913 bool hasVGPRIndexMode() const { 914 return HasVGPRIndexMode; 915 } 916 917 bool useVGPRIndexMode() const; 918 919 bool hasScalarCompareEq64() const { 920 return getGeneration() >= VOLCANIC_ISLANDS; 921 } 922 923 bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; } 924 925 bool hasScalarStores() const { 926 return HasScalarStores; 927 } 928 929 bool hasScalarAtomics() const { 930 return HasScalarAtomics; 931 } 932 933 bool hasLDSFPAtomicAdd() const { return GFX8Insts; } 934 935 /// \returns true if the subtarget has the v_permlanex16_b32 instruction. 936 bool hasPermLaneX16() const { return getGeneration() >= GFX10; } 937 938 /// \returns true if the subtarget has the v_permlane64_b32 instruction. 
939 bool hasPermLane64() const { return getGeneration() >= GFX11; } 940 941 bool hasDPP() const { 942 return HasDPP; 943 } 944 945 bool hasDPPBroadcasts() const { 946 return HasDPP && getGeneration() < GFX10; 947 } 948 949 bool hasDPPWavefrontShifts() const { 950 return HasDPP && getGeneration() < GFX10; 951 } 952 953 bool hasDPP8() const { 954 return HasDPP8; 955 } 956 957 bool hasDPALU_DPP() const { 958 return HasDPALU_DPP; 959 } 960 961 bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; } 962 963 bool hasPackedFP32Ops() const { 964 return HasPackedFP32Ops; 965 } 966 967 // Has V_PK_MOV_B32 opcode 968 bool hasPkMovB32() const { 969 return GFX90AInsts; 970 } 971 972 bool hasFmaakFmamkF32Insts() const { 973 return getGeneration() >= GFX10 || hasGFX940Insts(); 974 } 975 976 bool hasImageInsts() const { 977 return HasImageInsts; 978 } 979 980 bool hasExtendedImageInsts() const { 981 return HasExtendedImageInsts; 982 } 983 984 bool hasR128A16() const { 985 return HasR128A16; 986 } 987 988 bool hasA16() const { return HasA16; } 989 990 bool hasG16() const { return HasG16; } 991 992 bool hasOffset3fBug() const { 993 return HasOffset3fBug; 994 } 995 996 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } 997 998 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } 999 1000 bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; } 1001 1002 bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; } 1003 1004 bool hasNSAEncoding() const { return HasNSAEncoding; } 1005 1006 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; } 1007 1008 bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; } 1009 1010 unsigned getNSAMaxSize(bool HasSampler = false) const { 1011 return AMDGPU::getNSAMaxSize(*this, HasSampler); 1012 } 1013 1014 bool hasGFX10_AEncoding() const { 1015 return GFX10_AEncoding; 1016 } 1017 1018 bool hasGFX10_BEncoding() const { 1019 return GFX10_BEncoding; 1020 } 1021 1022 bool hasGFX10_3Insts() 
const { 1023 return GFX10_3Insts; 1024 } 1025 1026 bool hasMadF16() const; 1027 1028 bool hasMovB64() const { return GFX940Insts; } 1029 1030 bool hasLshlAddB64() const { return GFX940Insts; } 1031 1032 bool enableSIScheduler() const { 1033 return EnableSIScheduler; 1034 } 1035 1036 bool loadStoreOptEnabled() const { 1037 return EnableLoadStoreOpt; 1038 } 1039 1040 bool hasSGPRInitBug() const { 1041 return SGPRInitBug; 1042 } 1043 1044 bool hasUserSGPRInit16Bug() const { 1045 return UserSGPRInit16Bug && isWave32(); 1046 } 1047 1048 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } 1049 1050 bool hasNegativeUnalignedScratchOffsetBug() const { 1051 return NegativeUnalignedScratchOffsetBug; 1052 } 1053 1054 bool hasMFMAInlineLiteralBug() const { 1055 return HasMFMAInlineLiteralBug; 1056 } 1057 1058 bool has12DWordStoreHazard() const { 1059 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 1060 } 1061 1062 // \returns true if the subtarget supports DWORDX3 load/store instructions. 
1063 bool hasDwordx3LoadStores() const { 1064 return CIInsts; 1065 } 1066 1067 bool hasReadM0MovRelInterpHazard() const { 1068 return getGeneration() == AMDGPUSubtarget::GFX9; 1069 } 1070 1071 bool hasReadM0SendMsgHazard() const { 1072 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 1073 getGeneration() <= AMDGPUSubtarget::GFX9; 1074 } 1075 1076 bool hasReadM0LdsDmaHazard() const { 1077 return getGeneration() == AMDGPUSubtarget::GFX9; 1078 } 1079 1080 bool hasReadM0LdsDirectHazard() const { 1081 return getGeneration() == AMDGPUSubtarget::GFX9; 1082 } 1083 1084 bool hasVcmpxPermlaneHazard() const { 1085 return HasVcmpxPermlaneHazard; 1086 } 1087 1088 bool hasVMEMtoScalarWriteHazard() const { 1089 return HasVMEMtoScalarWriteHazard; 1090 } 1091 1092 bool hasSMEMtoVectorWriteHazard() const { 1093 return HasSMEMtoVectorWriteHazard; 1094 } 1095 1096 bool hasLDSMisalignedBug() const { 1097 return LDSMisalignedBug && !EnableCuMode; 1098 } 1099 1100 bool hasInstFwdPrefetchBug() const { 1101 return HasInstFwdPrefetchBug; 1102 } 1103 1104 bool hasVcmpxExecWARHazard() const { 1105 return HasVcmpxExecWARHazard; 1106 } 1107 1108 bool hasLdsBranchVmemWARHazard() const { 1109 return HasLdsBranchVmemWARHazard; 1110 } 1111 1112 // Shift amount of a 64 bit shift cannot be a highest allocated register 1113 // if also at the end of the allocation block. 1114 bool hasShift64HighRegBug() const { 1115 return GFX90AInsts && !GFX940Insts; 1116 } 1117 1118 // Has one cycle hazard on transcendental instruction feeding a 1119 // non transcendental VALU. 1120 bool hasTransForwardingHazard() const { return GFX940Insts; } 1121 1122 // Has one cycle hazard on a VALU instruction partially writing dst with 1123 // a shift of result bits feeding another VALU instruction. 1124 bool hasDstSelForwardingHazard() const { return GFX940Insts; } 1125 1126 // Cannot use op_sel with v_dot instructions. 
1127 bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; } 1128 1129 // Does not have HW interlocs for VALU writing and then reading SGPRs. 1130 bool hasVDecCoExecHazard() const { 1131 return GFX940Insts; 1132 } 1133 1134 bool hasNSAtoVMEMBug() const { 1135 return HasNSAtoVMEMBug; 1136 } 1137 1138 bool hasNSAClauseBug() const { return HasNSAClauseBug; } 1139 1140 bool hasHardClauses() const { return getGeneration() >= GFX10; } 1141 1142 bool hasGFX90AInsts() const { return GFX90AInsts; } 1143 1144 bool hasFPAtomicToDenormModeHazard() const { 1145 return getGeneration() == GFX10; 1146 } 1147 1148 bool hasVOP3DPP() const { return getGeneration() >= GFX11; } 1149 1150 bool hasLdsDirect() const { return getGeneration() >= GFX11; } 1151 1152 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; } 1153 1154 bool hasVALUPartialForwardingHazard() const { 1155 return getGeneration() == GFX11; 1156 } 1157 1158 bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; } 1159 1160 bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; } 1161 1162 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } 1163 1164 /// Return if operations acting on VGPR tuples require even alignment. 1165 bool needsAlignedVGPRs() const { return GFX90AInsts; } 1166 1167 /// Return true if the target has the S_PACK_HL_B32_B16 instruction. 1168 bool hasSPackHL() const { return GFX11Insts; } 1169 1170 /// Return true if the target's EXP instruction has the COMPR flag, which 1171 /// affects the meaning of the EN (enable) bits. 1172 bool hasCompressedExport() const { return !GFX11Insts; } 1173 1174 /// Return true if the target's EXP instruction supports the NULL export 1175 /// target. 
1176 bool hasNullExportTarget() const { return !GFX11Insts; } 1177 1178 bool hasGFX11FullVGPRs() const { return HasGFX11FullVGPRs; } 1179 1180 bool hasVOPDInsts() const { return HasVOPDInsts; } 1181 1182 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } 1183 1184 /// Return true if the target has the S_DELAY_ALU instruction. 1185 bool hasDelayAlu() const { return GFX11Insts; } 1186 1187 bool hasPackedTID() const { return HasPackedTID; } 1188 1189 // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that 1190 // hasGFX90AInsts is also true. 1191 bool hasGFX940Insts() const { return GFX940Insts; } 1192 1193 bool hasSALUFloatInsts() const { return HasSALUFloatInsts; } 1194 1195 bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; } 1196 1197 bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; } 1198 1199 bool hasRestrictedSOffset() const { return HasRestrictedSOffset; } 1200 1201 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt 1202 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively. 1203 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; } 1204 1205 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 1206 /// SGPRs 1207 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 1208 1209 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 1210 /// VGPRs 1211 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 1212 1213 /// Return occupancy for the given function. Used LDS and a number of 1214 /// registers if provided. 1215 /// Note, occupancy can be affected by the scratch allocation as well, but 1216 /// we do not have enough information to compute it. 
1217 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, 1218 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; 1219 1220 /// \returns true if the flat_scratch register should be initialized with the 1221 /// pointer to the wave's scratch memory rather than a size and offset. 1222 bool flatScratchIsPointer() const { 1223 return getGeneration() >= AMDGPUSubtarget::GFX9; 1224 } 1225 1226 /// \returns true if the flat_scratch register is initialized by the HW. 1227 /// In this case it is readonly. 1228 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } 1229 1230 /// \returns true if the architected SGPRs are enabled. 1231 bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; } 1232 1233 /// \returns true if Global Data Share is supported. 1234 bool hasGDS() const { return HasGDS; } 1235 1236 /// \returns true if Global Wave Sync is supported. 1237 bool hasGWS() const { return HasGWS; } 1238 1239 /// \returns true if the machine has merged shaders in which s0-s7 are 1240 /// reserved by the hardware and user SGPRs start at s8 1241 bool hasMergedShaders() const { 1242 return getGeneration() >= GFX9; 1243 } 1244 1245 // \returns true if the target supports the pre-NGG legacy geometry path. 1246 bool hasLegacyGeometry() const { return getGeneration() < GFX11; } 1247 1248 // \returns true if preloading kernel arguments is supported. 1249 bool hasKernargPreload() const { return KernargPreload; } 1250 1251 // \returns true if we need to generate backwards compatible code when 1252 // preloading kernel arguments. 1253 bool needsKernargPreloadBackwardsCompatibility() const { 1254 return hasKernargPreload() && !hasGFX940Insts(); 1255 } 1256 1257 // \returns true if the target has split barriers feature 1258 bool hasSplitBarriers() const { return getGeneration() >= GFX12; } 1259 1260 // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable. 
1261 bool hasCvtFP8VOP1Bug() const { return true; } 1262 1263 // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a 1264 // no-return form. 1265 bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; } 1266 1267 // \returns true if the target has DX10_CLAMP kernel descriptor mode bit 1268 bool hasDX10ClampMode() const { return getGeneration() < GFX12; } 1269 1270 // \returns true if the target has IEEE kernel descriptor mode bit 1271 bool hasIEEEMode() const { return getGeneration() < GFX12; } 1272 1273 // \returns true if the target has IEEE fminimum/fmaximum instructions 1274 bool hasIEEEMinMax() const { return getGeneration() >= GFX12; } 1275 1276 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit 1277 bool hasRrWGMode() const { return getGeneration() >= GFX12; } 1278 1279 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative 1280 /// values. 1281 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; } 1282 1283 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead 1284 // of sign-extending. 1285 bool hasGetPCZeroExtension() const { return GFX12Insts; } 1286 1287 /// \returns SGPR allocation granularity supported by the subtarget. 1288 unsigned getSGPRAllocGranule() const { 1289 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 1290 } 1291 1292 /// \returns SGPR encoding granularity supported by the subtarget. 1293 unsigned getSGPREncodingGranule() const { 1294 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 1295 } 1296 1297 /// \returns Total number of SGPRs supported by the subtarget. 1298 unsigned getTotalNumSGPRs() const { 1299 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 1300 } 1301 1302 /// \returns Addressable number of SGPRs supported by the subtarget. 
1303 unsigned getAddressableNumSGPRs() const { 1304 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); 1305 } 1306 1307 /// \returns Minimum number of SGPRs that meets the given number of waves per 1308 /// execution unit requirement supported by the subtarget. 1309 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 1310 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); 1311 } 1312 1313 /// \returns Maximum number of SGPRs that meets the given number of waves per 1314 /// execution unit requirement supported by the subtarget. 1315 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 1316 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); 1317 } 1318 1319 /// \returns Reserved number of SGPRs. This is common 1320 /// utility function called by MachineFunction and 1321 /// Function variants of getReservedNumSGPRs. 1322 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const; 1323 /// \returns Reserved number of SGPRs for given machine function \p MF. 1324 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 1325 1326 /// \returns Reserved number of SGPRs for given function \p F. 1327 unsigned getReservedNumSGPRs(const Function &F) const; 1328 1329 /// \returns max num SGPRs. This is the common utility 1330 /// function called by MachineFunction and Function 1331 /// variants of getMaxNumSGPRs. 1332 unsigned getBaseMaxNumSGPRs(const Function &F, 1333 std::pair<unsigned, unsigned> WavesPerEU, 1334 unsigned PreloadedSGPRs, 1335 unsigned ReservedNumSGPRs) const; 1336 1337 /// \returns Maximum number of SGPRs that meets number of waves per execution 1338 /// unit requirement for function \p MF, or number of SGPRs explicitly 1339 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 
1340 /// 1341 /// \returns Value that meets number of waves per execution unit requirement 1342 /// if explicitly requested value cannot be converted to integer, violates 1343 /// subtarget's specifications, or does not meet number of waves per execution 1344 /// unit requirement. 1345 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 1346 1347 /// \returns Maximum number of SGPRs that meets number of waves per execution 1348 /// unit requirement for function \p F, or number of SGPRs explicitly 1349 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F. 1350 /// 1351 /// \returns Value that meets number of waves per execution unit requirement 1352 /// if explicitly requested value cannot be converted to integer, violates 1353 /// subtarget's specifications, or does not meet number of waves per execution 1354 /// unit requirement. 1355 unsigned getMaxNumSGPRs(const Function &F) const; 1356 1357 /// \returns VGPR allocation granularity supported by the subtarget. 1358 unsigned getVGPRAllocGranule() const { 1359 return AMDGPU::IsaInfo::getVGPRAllocGranule(this); 1360 } 1361 1362 /// \returns VGPR encoding granularity supported by the subtarget. 1363 unsigned getVGPREncodingGranule() const { 1364 return AMDGPU::IsaInfo::getVGPREncodingGranule(this); 1365 } 1366 1367 /// \returns Total number of VGPRs supported by the subtarget. 1368 unsigned getTotalNumVGPRs() const { 1369 return AMDGPU::IsaInfo::getTotalNumVGPRs(this); 1370 } 1371 1372 /// \returns Addressable number of VGPRs supported by the subtarget. 1373 unsigned getAddressableNumVGPRs() const { 1374 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); 1375 } 1376 1377 /// \returns the minimum number of VGPRs that will prevent achieving more than 1378 /// the specified number of waves \p WavesPerEU. 
1379 unsigned getMinNumVGPRs(unsigned WavesPerEU) const { 1380 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); 1381 } 1382 1383 /// \returns the maximum number of VGPRs that can be used and still achieved 1384 /// at least the specified number of waves \p WavesPerEU. 1385 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { 1386 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); 1387 } 1388 1389 /// \returns max num VGPRs. This is the common utility function 1390 /// called by MachineFunction and Function variants of getMaxNumVGPRs. 1391 unsigned getBaseMaxNumVGPRs(const Function &F, 1392 std::pair<unsigned, unsigned> WavesPerEU) const; 1393 /// \returns Maximum number of VGPRs that meets number of waves per execution 1394 /// unit requirement for function \p F, or number of VGPRs explicitly 1395 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F. 1396 /// 1397 /// \returns Value that meets number of waves per execution unit requirement 1398 /// if explicitly requested value cannot be converted to integer, violates 1399 /// subtarget's specifications, or does not meet number of waves per execution 1400 /// unit requirement. 1401 unsigned getMaxNumVGPRs(const Function &F) const; 1402 1403 unsigned getMaxNumAGPRs(const Function &F) const { 1404 return getMaxNumVGPRs(F); 1405 } 1406 1407 /// \returns Maximum number of VGPRs that meets number of waves per execution 1408 /// unit requirement for function \p MF, or number of VGPRs explicitly 1409 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 1410 /// 1411 /// \returns Value that meets number of waves per execution unit requirement 1412 /// if explicitly requested value cannot be converted to integer, violates 1413 /// subtarget's specifications, or does not meet number of waves per execution 1414 /// unit requirement. 
1415 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 1416 1417 void getPostRAMutations( 1418 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) 1419 const override; 1420 1421 std::unique_ptr<ScheduleDAGMutation> 1422 createFillMFMAShadowMutation(const TargetInstrInfo *TII) const; 1423 1424 bool isWave32() const { 1425 return getWavefrontSize() == 32; 1426 } 1427 1428 bool isWave64() const { 1429 return getWavefrontSize() == 64; 1430 } 1431 1432 const TargetRegisterClass *getBoolRC() const { 1433 return getRegisterInfo()->getBoolRC(); 1434 } 1435 1436 /// \returns Maximum number of work groups per compute unit supported by the 1437 /// subtarget and limited by given \p FlatWorkGroupSize. 1438 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1439 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1440 } 1441 1442 /// \returns Minimum flat work group size supported by the subtarget. 1443 unsigned getMinFlatWorkGroupSize() const override { 1444 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1445 } 1446 1447 /// \returns Maximum flat work group size supported by the subtarget. 1448 unsigned getMaxFlatWorkGroupSize() const override { 1449 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1450 } 1451 1452 /// \returns Number of waves per execution unit required to support the given 1453 /// \p FlatWorkGroupSize. 1454 unsigned 1455 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { 1456 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); 1457 } 1458 1459 /// \returns Minimum number of waves per execution unit supported by the 1460 /// subtarget. 
1461 unsigned getMinWavesPerEU() const override { 1462 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1463 } 1464 1465 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, 1466 SDep &Dep) const override; 1467 1468 // \returns true if it's beneficial on this subtarget for the scheduler to 1469 // cluster stores as well as loads. 1470 bool shouldClusterStores() const { return getGeneration() >= GFX11; } 1471 1472 // \returns the number of address arguments from which to enable MIMG NSA 1473 // on supported architectures. 1474 unsigned getNSAThreshold(const MachineFunction &MF) const; 1475 1476 // \returns true if the subtarget has a hazard requiring an "s_nop 0" 1477 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)". 1478 bool requiresNopBeforeDeallocVGPRs() const { 1479 // Currently all targets that support the dealloc VGPRs message also require 1480 // the nop. 1481 return true; 1482 } 1483 }; 1484 1485 class GCNUserSGPRUsageInfo { 1486 public: 1487 bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; } 1488 1489 bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; } 1490 1491 bool hasDispatchPtr() const { return DispatchPtr; } 1492 1493 bool hasQueuePtr() const { return QueuePtr; } 1494 1495 bool hasKernargSegmentPtr() const { return KernargSegmentPtr; } 1496 1497 bool hasDispatchID() const { return DispatchID; } 1498 1499 bool hasFlatScratchInit() const { return FlatScratchInit; } 1500 1501 unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; } 1502 1503 unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; } 1504 1505 unsigned getNumFreeUserSGPRs(); 1506 1507 void allocKernargPreloadSGPRs(unsigned NumSGPRs); 1508 1509 enum UserSGPRID : unsigned { 1510 ImplicitBufferPtrID = 0, 1511 PrivateSegmentBufferID = 1, 1512 DispatchPtrID = 2, 1513 QueuePtrID = 3, 1514 KernargSegmentPtrID = 4, 1515 DispatchIdID = 5, 1516 FlatScratchInitID = 6, 1517 PrivateSegmentSizeID = 7 1518 }; 
1519 1520 // Returns the size in number of SGPRs for preload user SGPR field. 1521 static unsigned getNumUserSGPRForField(UserSGPRID ID) { 1522 switch (ID) { 1523 case ImplicitBufferPtrID: 1524 return 2; 1525 case PrivateSegmentBufferID: 1526 return 4; 1527 case DispatchPtrID: 1528 return 2; 1529 case QueuePtrID: 1530 return 2; 1531 case KernargSegmentPtrID: 1532 return 2; 1533 case DispatchIdID: 1534 return 2; 1535 case FlatScratchInitID: 1536 return 2; 1537 case PrivateSegmentSizeID: 1538 return 1; 1539 } 1540 llvm_unreachable("Unknown UserSGPRID."); 1541 } 1542 1543 GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST); 1544 1545 private: 1546 const GCNSubtarget &ST; 1547 1548 // Private memory buffer 1549 // Compute directly in sgpr[0:1] 1550 // Other shaders indirect 64-bits at sgpr[0:1] 1551 bool ImplicitBufferPtr = false; 1552 1553 bool PrivateSegmentBuffer = false; 1554 1555 bool DispatchPtr = false; 1556 1557 bool QueuePtr = false; 1558 1559 bool KernargSegmentPtr = false; 1560 1561 bool DispatchID = false; 1562 1563 bool FlatScratchInit = false; 1564 1565 unsigned NumKernargPreloadSGPRs = 0; 1566 1567 unsigned NumUsedUserSGPRs = 0; 1568 }; 1569 1570 } // end namespace llvm 1571 1572 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 1573