//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// AMD GCN specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H

#include "AMDGPUCallLowering.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIFrameLowering.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/Support/ErrorHandling.h"

#define GET_SUBTARGETINFO_HEADER
#include "AMDGPUGenSubtargetInfo.inc"

namespace llvm {

class GCNTargetMachine;

class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
                           public AMDGPUSubtarget {
public:
  using AMDGPUSubtarget::getMaxWavesPerEU;

  // Following 2 enums are documented at:
  //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  enum class TrapHandlerAbi {
    NONE = 0x00,
    AMDHSA = 0x01,
  };

  enum class TrapID {
    LLVMAMDHSATrap = 0x02,
    LLVMAMDHSADebugTrap = 0x03,
  };

private:
  /// GlobalISel related APIs.
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
  std::unique_ptr<InstructionSelector> InstSelector;
  std::unique_ptr<LegalizerInfo> Legalizer;
  std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;

protected:
  // Basic subtarget description.
  Triple TargetTriple;
  AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
  unsigned Gen = INVALID;
  InstrItineraryData InstrItins;
  int LDSBankCount = 0;
  unsigned MaxPrivateElementSize = 0;

  // Possibly statically set by tablegen, but may want to be overridden.
  bool FastDenormalF32 = false;
  bool HalfRate64Ops = false;
  bool FullRate64Ops = false;

  // Dynamically set bits that enable features.
  bool FlatForGlobal = false;
  bool AutoWaitcntBeforeBarrier = false;
  bool BackOffBarrier = false;
  bool UnalignedScratchAccess = false;
  bool UnalignedAccessMode = false;
  bool HasApertureRegs = false;
  bool SupportsXNACK = false;
  bool KernargPreload = false;

  // This should not be used directly. 'TargetID' tracks the dynamic settings
  // for XNACK.
  bool EnableXNACK = false;

  bool EnableTgSplit = false;
  bool EnableCuMode = false;
  bool TrapHandler = false;
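
  // Note: the dynamic XNACK and SRAMECC settings are tracked by 'TargetID'
  // (AMDGPU::IsaInfo::AMDGPUTargetID, declared in Utils/AMDGPUBaseInfo.h) and
  // correspond to target-id suffixes such as "gfx90a:xnack+:sramecc-".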

  // Used as options.
  bool EnableLoadStoreOpt = false;
  bool EnableUnsafeDSOffsetFolding = false;
  bool EnableSIScheduler = false;
  bool EnableDS128 = false;
  bool EnablePRTStrictNull = false;
  bool DumpCode = false;

  // Subtarget properties statically set by tablegen.
  bool FP64 = false;
  bool FMA = false;
  bool MIMG_R128 = false;
  bool CIInsts = false;
  bool GFX8Insts = false;
  bool GFX9Insts = false;
  bool GFX90AInsts = false;
  bool GFX940Insts = false;
  bool GFX10Insts = false;
  bool GFX11Insts = false;
  bool GFX12Insts = false;
  bool GFX10_3Insts = false;
  bool GFX7GFX8GFX9Insts = false;
  bool SGPRInitBug = false;
  bool UserSGPRInit16Bug = false;
  bool NegativeScratchOffsetBug = false;
  bool NegativeUnalignedScratchOffsetBug = false;
  bool HasSMemRealTime = false;
  bool HasIntClamp = false;
  bool HasFmaMixInsts = false;
  bool HasMovrel = false;
  bool HasVGPRIndexMode = false;
  bool HasScalarDwordx3Loads = false;
  bool HasScalarStores = false;
  bool HasScalarAtomics = false;
  bool HasSDWAOmod = false;
  bool HasSDWAScalar = false;
  bool HasSDWASdst = false;
  bool HasSDWAMac = false;
  bool HasSDWAOutModsVOPC = false;
  bool HasDPP = false;
  bool HasDPP8 = false;
  bool HasDPALU_DPP = false;
  bool HasDPPSrc1SGPR = false;
  bool HasPackedFP32Ops = false;
  bool HasImageInsts = false;
  bool HasExtendedImageInsts = false;
  bool HasR128A16 = false;
  bool HasA16 = false;
  bool HasG16 = false;
  bool HasNSAEncoding = false;
  bool HasPartialNSAEncoding = false;
  bool GFX10_AEncoding = false;
  bool GFX10_BEncoding = false;
  bool HasDLInsts = false;
  bool HasFmacF64Inst = false;
  bool HasDot1Insts = false;
  bool HasDot2Insts = false;
  bool HasDot3Insts = false;
  bool HasDot4Insts = false;
  bool HasDot5Insts = false;
  bool HasDot6Insts = false;
  bool HasDot7Insts = false;
  bool HasDot8Insts = false;
  bool HasDot9Insts = false;
  bool HasDot10Insts = false;
  bool HasMAIInsts = false;
  bool HasFP8Insts = false;
  bool HasPkFmacF16Inst = false;
  bool HasAtomicDsPkAdd16Insts = false;
  bool HasAtomicFlatPkAdd16Insts = false;
  bool HasAtomicFaddRtnInsts = false;
  bool HasAtomicFaddNoRtnInsts = false;
  bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
  bool HasAtomicBufferGlobalPkAddF16Insts = false;
  bool HasAtomicCSubNoRtnInsts = false;
  bool HasAtomicGlobalPkAddBF16Inst = false;
  bool HasFlatAtomicFaddF32Inst = false;
  bool SupportsSRAMECC = false;

  // This should not be used directly. 'TargetID' tracks the dynamic settings
  // for SRAMECC.
  bool EnableSRAMECC = false;

  bool HasNoSdstCMPX = false;
  bool HasVscnt = false;
  bool HasGetWaveIdInst = false;
  bool HasSMemTimeInst = false;
  bool HasShaderCyclesRegister = false;
  bool HasShaderCyclesHiLoRegisters = false;
  bool HasVOP3Literal = false;
  bool HasNoDataDepHazard = false;
  bool FlatAddressSpace = false;
  bool FlatInstOffsets = false;
  bool FlatGlobalInsts = false;
  bool FlatScratchInsts = false;
  bool ScalarFlatScratchInsts = false;
  bool HasArchitectedFlatScratch = false;
  bool EnableFlatScratch = false;
  bool HasArchitectedSGPRs = false;
  bool HasGDS = false;
  bool HasGWS = false;
  bool AddNoCarryInsts = false;
  bool HasUnpackedD16VMem = false;
  bool LDSMisalignedBug = false;
  bool HasMFMAInlineLiteralBug = false;
  bool UnalignedBufferAccess = false;
  bool UnalignedDSAccess = false;
  bool HasPackedTID = false;
  bool ScalarizeGlobal = false;
  bool HasSALUFloatInsts = false;
  bool HasVGPRSingleUseHintInsts = false;
  bool HasPseudoScalarTrans = false;
  bool HasRestrictedSOffset = false;

  bool HasVcmpxPermlaneHazard = false;
  bool HasVMEMtoScalarWriteHazard = false;
  bool HasSMEMtoVectorWriteHazard = false;
  bool HasInstFwdPrefetchBug = false;
  bool HasVcmpxExecWARHazard = false;
  bool HasLdsBranchVmemWARHazard = false;
  bool HasNSAtoVMEMBug = false;
  bool HasNSAClauseBug = false;
  bool HasOffset3fBug = false;
  bool HasFlatSegmentOffsetBug = false;
  bool HasImageStoreD16Bug = false;
  bool HasImageGather4D16Bug = false;
  bool HasMSAALoadDstSelBug = false;
  bool HasGFX11FullVGPRs = false;
  bool HasMADIntraFwdBug = false;
  bool HasVOPDInsts = false;
  bool HasVALUTransUseHazard = false;
  bool HasForceStoreSC0SC1 = false;

  // Dummy feature to use for assembler in tablegen.
  bool FeatureDisable = false;

  SelectionDAGTargetInfo TSInfo;

private:
  SIInstrInfo InstrInfo;
  SITargetLowering TLInfo;
  SIFrameLowering FrameLowering;

public:
  GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
               const GCNTargetMachine &TM);
  ~GCNSubtarget() override;

  GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
                                                StringRef GPU, StringRef FS);

  const SIInstrInfo *getInstrInfo() const override {
    return &InstrInfo;
  }

  const SIFrameLowering *getFrameLowering() const override {
    return &FrameLowering;
  }

  const SITargetLowering *getTargetLowering() const override {
    return &TLInfo;
  }

  const SIRegisterInfo *getRegisterInfo() const override {
    return &InstrInfo.getRegisterInfo();
  }

  const CallLowering *getCallLowering() const override {
    return CallLoweringInfo.get();
  }

  const InlineAsmLowering *getInlineAsmLowering() const override {
    return InlineAsmLoweringInfo.get();
  }

  InstructionSelector *getInstructionSelector() const override {
    return InstSelector.get();
  }

  const LegalizerInfo *getLegalizerInfo() const override {
    return Legalizer.get();
  }

  const AMDGPURegisterBankInfo *getRegBankInfo() const override {
    return RegBankInfo.get();
  }

  const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
    return TargetID;
  }
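
  // Illustrative usage sketch (not part of this interface; assumes a
  // MachineFunction MF is in scope):
  //   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  //   const SIInstrInfo *TII = ST.getInstrInfo();
  //   const SIRegisterInfo *TRI = ST.getRegisterInfo();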

  // Nothing implemented, just prevent crashes on use.
  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
    return &TSInfo;
  }

  const InstrItineraryData *getInstrItineraryData() const override {
    return &InstrItins;
  }

  void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);

  Generation getGeneration() const {
    return (Generation)Gen;
  }

  unsigned getMaxWaveScratchSize() const {
    // See COMPUTE_TMPRING_SIZE.WAVESIZE.
    if (getGeneration() < GFX11) {
      // 13-bit field in units of 256 dwords.
      return (256 * 4) * ((1 << 13) - 1);
    }
    // 15-bit field in units of 64 dwords.
    return (64 * 4) * ((1 << 15) - 1);
  }

  /// Return the number of high bits known to be zero for a frame index.
  unsigned getKnownHighZeroBitsForFrameIndex() const {
    return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
  }

  int getLDSBankCount() const {
    return LDSBankCount;
  }

  unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
    return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
  }

  unsigned getConstantBusLimit(unsigned Opcode) const;

  /// Return true if an instruction with a 16-bit result returned in a 32-bit
  /// register implicitly zeroes the high 16 bits, rather than preserving the
  /// original value.
  bool zeroesHigh16BitsOfDest(unsigned Opcode) const;

  bool supportsWGP() const { return getGeneration() >= GFX10; }

  bool hasIntClamp() const {
    return HasIntClamp;
  }

  bool hasFP64() const {
    return FP64;
  }

  bool hasMIMG_R128() const {
    return MIMG_R128;
  }

  bool hasHWFP64() const {
    return FP64;
  }

  bool hasHalfRate64Ops() const {
    return HalfRate64Ops;
  }

  bool hasFullRate64Ops() const {
    return FullRate64Ops;
  }

  bool hasAddr64() const {
    return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
  }

  bool hasFlat() const {
    return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
  }

  // Return true if the target only has the reverse operand versions of VALU
  // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
  bool hasOnlyRevVALUShifts() const {
    return getGeneration() >= VOLCANIC_ISLANDS;
  }

  bool hasFractBug() const {
    return getGeneration() == SOUTHERN_ISLANDS;
  }

  bool hasBFE() const {
    return true;
  }

  bool hasBFI() const {
    return true;
  }

  bool hasBFM() const {
    return hasBFE();
  }

  bool hasBCNT(unsigned Size) const {
    return true;
  }

  bool hasFFBL() const {
    return true;
  }

  bool hasFFBH() const {
    return true;
  }

  bool hasMed3_16() const {
    return getGeneration() >= AMDGPUSubtarget::GFX9;
  }

  bool hasMin3Max3_16() const {
    return getGeneration() >= AMDGPUSubtarget::GFX9;
  }

  bool hasFmaMixInsts() const {
    return HasFmaMixInsts;
  }

  bool hasCARRY() const {
    return true;
  }

  bool hasFMA() const {
    return FMA;
  }

  bool hasSwap() const {
    return GFX9Insts;
  }

  bool hasScalarPackInsts() const {
    return GFX9Insts;
  }

  bool hasScalarMulHiInsts() const {
    return GFX9Insts;
  }

  TrapHandlerAbi getTrapHandlerAbi() const {
    return isAmdHsaOS() ?
               TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
  }

  bool supportsGetDoorbellID() const {
    // The S_GETREG DOORBELL_ID is supported by all targets from GFX9 onward.
    return getGeneration() >= GFX9;
  }

  /// True if the offset field of DS instructions works as expected. On SI, the
  /// offset uses a 16-bit adder and does not always wrap properly.
  bool hasUsableDSOffset() const {
    return getGeneration() >= SEA_ISLANDS;
  }

  bool unsafeDSOffsetFoldingEnabled() const {
    return EnableUnsafeDSOffsetFolding;
  }

  /// Condition output from div_scale is usable.
  bool hasUsableDivScaleConditionOutput() const {
    return getGeneration() != SOUTHERN_ISLANDS;
  }

  /// Extra wait hazard is needed in some cases before
  /// s_cbranch_vccnz/s_cbranch_vccz.
  bool hasReadVCCZBug() const {
    return getGeneration() <= SEA_ISLANDS;
  }

  /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
  bool partialVCCWritesUpdateVCCZ() const {
    return getGeneration() >= GFX10;
  }

  /// A read of an SGPR by an SMRD instruction requires 4 wait states when the
  /// SGPR was written by a VALU instruction.
  bool hasSMRDReadVALUDefHazard() const {
    return getGeneration() == SOUTHERN_ISLANDS;
  }

  /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
  /// SGPR was written by a VALU instruction.
  bool hasVMEMReadSGPRVALUDefHazard() const {
    return getGeneration() >= VOLCANIC_ISLANDS;
  }

  bool hasRFEHazards() const {
    return getGeneration() >= VOLCANIC_ISLANDS;
  }

  /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
  unsigned getSetRegWaitStates() const {
    return getGeneration() <= SEA_ISLANDS ? 1 : 2;
  }

  bool dumpCode() const {
    return DumpCode;
  }

  /// Return the amount of LDS that can be used without restricting occupancy
  /// below \p WaveCount.
  unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
                                           const Function &) const;

  bool supportsMinMaxDenormModes() const {
    return getGeneration() >= AMDGPUSubtarget::GFX9;
  }

  /// \returns true if the target supports S_DENORM_MODE.
  bool hasDenormModeInst() const {
    return getGeneration() >= AMDGPUSubtarget::GFX10;
  }

  bool useFlatForGlobal() const {
    return FlatForGlobal;
  }

  /// \returns true if the target supports ds_read/write_b128 and the user has
  /// enabled generation of ds_read/write_b128.
  bool useDS128() const {
    return CIInsts && EnableDS128;
  }

  /// \returns true if the target supports ds_read/write_b96/128.
  bool hasDS96AndDS128() const {
    return CIInsts;
  }

  /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
  bool haveRoundOpsF64() const {
    return CIInsts;
  }

  /// \returns true if MUBUF instructions always perform range checking, even
  /// for buffer resources used for private memory access.
  bool privateMemoryResourceIsRangeChecked() const {
    return getGeneration() < AMDGPUSubtarget::GFX9;
  }

  /// \returns true if the target requires PRT Struct NULL support (zero result
  /// registers for sparse texture support).
  bool usePRTStrictNull() const {
    return EnablePRTStrictNull;
  }

  bool hasAutoWaitcntBeforeBarrier() const {
    return AutoWaitcntBeforeBarrier;
  }
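
  // Note: the wait-state and hazard queries above describe hardware behavior;
  // they are primarily consumed by the AMDGPU hazard recognizer when deciding
  // where to insert s_nop instructions or additional wait states.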

  /// \returns true if the target supports backing off of s_barrier instructions
  /// when an exception is raised.
  bool supportsBackOffBarrier() const {
    return BackOffBarrier;
  }

  bool hasUnalignedBufferAccess() const {
    return UnalignedBufferAccess;
  }

  bool hasUnalignedBufferAccessEnabled() const {
    return UnalignedBufferAccess && UnalignedAccessMode;
  }

  bool hasUnalignedDSAccess() const {
    return UnalignedDSAccess;
  }

  bool hasUnalignedDSAccessEnabled() const {
    return UnalignedDSAccess && UnalignedAccessMode;
  }

  bool hasUnalignedScratchAccess() const {
    return UnalignedScratchAccess;
  }

  bool hasUnalignedAccessMode() const {
    return UnalignedAccessMode;
  }

  bool hasApertureRegs() const {
    return HasApertureRegs;
  }

  bool isTrapHandlerEnabled() const {
    return TrapHandler;
  }

  bool isXNACKEnabled() const {
    return TargetID.isXnackOnOrAny();
  }

  bool isTgSplitEnabled() const {
    return EnableTgSplit;
  }

  bool isCuModeEnabled() const {
    return EnableCuMode;
  }

  bool hasFlatAddressSpace() const {
    return FlatAddressSpace;
  }

  bool hasFlatScrRegister() const {
    return hasFlatAddressSpace();
  }

  bool hasFlatInstOffsets() const {
    return FlatInstOffsets;
  }

  bool hasFlatGlobalInsts() const {
    return FlatGlobalInsts;
  }

  bool hasFlatScratchInsts() const {
    return FlatScratchInsts;
  }

  // Check if the target supports the ST addressing mode with FLAT scratch
  // instructions. In the ST addressing mode no registers are used, neither
  // VGPR nor SGPR; only the immediate offset is swizzled and added to the
  // FLAT scratch base.
  bool hasFlatScratchSTMode() const {
    return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
  }

  bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }

  bool hasScalarFlatScratchInsts() const {
    return ScalarFlatScratchInsts;
  }

  bool enableFlatScratch() const {
    return flatScratchIsArchitected() ||
           (EnableFlatScratch && hasFlatScratchInsts());
  }

  bool hasGlobalAddTidInsts() const {
    return GFX10_BEncoding;
  }

  bool hasAtomicCSub() const {
    return GFX10_BEncoding;
  }

  bool hasMultiDwordFlatScratchAddressing() const {
    return getGeneration() >= GFX9;
  }

  bool hasFlatSegmentOffsetBug() const {
    return HasFlatSegmentOffsetBug;
  }

  bool hasFlatLgkmVMemCountInOrder() const {
    return getGeneration() > GFX9;
  }

  bool hasD16LoadStore() const {
    return getGeneration() >= GFX9;
  }

  bool d16PreservesUnusedBits() const {
    return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
  }

  bool hasD16Images() const {
    return getGeneration() >= VOLCANIC_ISLANDS;
  }

  /// Return true if most LDS instructions have an m0 use that requires m0 to
  /// be initialized.
  bool ldsRequiresM0Init() const {
    return getGeneration() < GFX9;
  }

  // True if the hardware rewinds and replays GWS operations if a wave is
  // preempted.
  //
  // If this is false, a GWS operation requires testing if a nack set the
  // MEM_VIOL bit, and repeating if so.
  bool hasGWSAutoReplay() const {
    return getGeneration() >= GFX9;
  }

  /// \returns true if the target has the ds_gws_sema_release_all instruction.
  bool hasGWSSemaReleaseAll() const {
    return CIInsts;
  }

  /// \returns true if the target has integer add/sub instructions that do not
  /// produce a carry-out.
  /// This includes v_add_[iu]32, v_sub_[iu]32, v_add_[iu]16, and v_sub_[iu]16,
  /// all of which support the clamp modifier for saturation.
  bool hasAddNoCarry() const {
    return AddNoCarryInsts;
  }

  bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }

  bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }

  bool hasUnpackedD16VMem() const {
    return HasUnpackedD16VMem;
  }

  // Covers VS/PS/CS graphics shaders
  bool isMesaGfxShader(const Function &F) const {
    return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
  }

  bool hasMad64_32() const {
    return getGeneration() >= SEA_ISLANDS;
  }

  bool hasSDWAOmod() const {
    return HasSDWAOmod;
  }

  bool hasSDWAScalar() const {
    return HasSDWAScalar;
  }

  bool hasSDWASdst() const {
    return HasSDWASdst;
  }

  bool hasSDWAMac() const {
    return HasSDWAMac;
  }

  bool hasSDWAOutModsVOPC() const {
    return HasSDWAOutModsVOPC;
  }

  bool hasDLInsts() const {
    return HasDLInsts;
  }

  bool hasFmacF64Inst() const { return HasFmacF64Inst; }

  bool hasDot1Insts() const {
    return HasDot1Insts;
  }

  bool hasDot2Insts() const {
    return HasDot2Insts;
  }

  bool hasDot3Insts() const {
    return HasDot3Insts;
  }

  bool hasDot4Insts() const {
    return HasDot4Insts;
  }

  bool hasDot5Insts() const {
    return HasDot5Insts;
  }

  bool hasDot6Insts() const {
    return HasDot6Insts;
  }

  bool hasDot7Insts() const {
    return HasDot7Insts;
  }

  bool hasDot8Insts() const {
    return HasDot8Insts;
  }

  bool hasDot9Insts() const {
    return HasDot9Insts;
  }

  bool hasDot10Insts() const {
    return HasDot10Insts;
  }

  bool hasMAIInsts() const {
    return HasMAIInsts;
  }

  bool hasFP8Insts() const {
    return HasFP8Insts;
  }

  bool hasPkFmacF16Inst() const {
    return HasPkFmacF16Inst;
  }

  bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }

  bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }

  bool hasAtomicFaddInsts() const {
    return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
  }

  bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }

  bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }

  bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
    return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
  }

  bool hasAtomicBufferGlobalPkAddF16Insts() const {
    return HasAtomicBufferGlobalPkAddF16Insts;
  }

  bool hasAtomicGlobalPkAddBF16Inst() const {
    return HasAtomicGlobalPkAddBF16Inst;
  }

  bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }

  bool hasNoSdstCMPX() const {
    return HasNoSdstCMPX;
  }

  bool hasVscnt() const {
    return HasVscnt;
  }

  bool hasGetWaveIdInst() const {
    return HasGetWaveIdInst;
  }

  bool hasSMemTimeInst() const {
    return HasSMemTimeInst;
  }

  bool hasShaderCyclesRegister() const {
    return HasShaderCyclesRegister;
  }

  bool hasShaderCyclesHiLoRegisters() const {
    return HasShaderCyclesHiLoRegisters;
  }

  bool hasVOP3Literal() const {
    return HasVOP3Literal;
  }

  bool hasNoDataDepHazard() const {
    return HasNoDataDepHazard;
  }

  bool vmemWriteNeedsExpWaitcnt() const {
    return getGeneration() < SEA_ISLANDS;
  }

  bool hasInstPrefetch() const { return getGeneration() >= GFX10; }

  bool hasPrefetch() const { return GFX12Insts; }

  // Has s_cmpk_* instructions.
  bool hasSCmpK() const { return getGeneration() < GFX12; }

  // Scratch is allocated in blocks of 256 dwords per wave for the entire
  // wavefront. When viewed from the perspective of an arbitrary workitem, this
  // is 4-byte aligned.
  //
  // Only 4-byte alignment is really needed to access anything. Transformations
  // on the pointer value itself may rely on the alignment / known low bits of
  // the pointer. Set this to something above the minimum to avoid needing
  // dynamic realignment in common cases.
  Align getStackAlignment() const { return Align(16); }

  bool enableMachineScheduler() const override {
    return true;
  }

  bool useAA() const override;

  bool enableSubRegLiveness() const override {
    return true;
  }

  void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
  bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }

  // Static wrappers.
  static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);

  // XXX - Why is this here if it isn't in the default pass set?
  bool enableEarlyIfConversion() const override {
    return true;
  }

  void overrideSchedPolicy(MachineSchedPolicy &Policy,
                           unsigned NumRegionInstrs) const override;

  unsigned getMaxNumUserSGPRs() const {
    return AMDGPU::getMaxNumUserSGPRs(*this);
  }

  bool hasSMemRealTime() const {
    return HasSMemRealTime;
  }

  bool hasMovrel() const {
    return HasMovrel;
  }

  bool hasVGPRIndexMode() const {
    return HasVGPRIndexMode;
  }

  bool useVGPRIndexMode() const;

  bool hasScalarCompareEq64() const {
    return getGeneration() >= VOLCANIC_ISLANDS;
  }

  bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }

  bool hasScalarStores() const {
    return HasScalarStores;
  }

  bool hasScalarAtomics() const {
    return HasScalarAtomics;
  }

  bool hasLDSFPAtomicAdd() const { return GFX8Insts; }

  /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
  bool hasPermLaneX16() const { return getGeneration() >= GFX10; }

  /// \returns true if the subtarget has the v_permlane64_b32 instruction.
  bool hasPermLane64() const { return getGeneration() >= GFX11; }

  bool hasDPP() const {
    return HasDPP;
  }

  bool hasDPPBroadcasts() const {
    return HasDPP && getGeneration() < GFX10;
  }

  bool hasDPPWavefrontShifts() const {
    return HasDPP && getGeneration() < GFX10;
  }

  bool hasDPP8() const {
    return HasDPP8;
  }

  bool hasDPALU_DPP() const {
    return HasDPALU_DPP;
  }

  bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }

  bool hasPackedFP32Ops() const {
    return HasPackedFP32Ops;
  }

  // Has V_PK_MOV_B32 opcode
  bool hasPkMovB32() const {
    return GFX90AInsts;
  }

  bool hasFmaakFmamkF32Insts() const {
    return getGeneration() >= GFX10 || hasGFX940Insts();
  }

  bool hasImageInsts() const {
    return HasImageInsts;
  }

  bool hasExtendedImageInsts() const {
    return HasExtendedImageInsts;
  }

  bool hasR128A16() const {
    return HasR128A16;
  }

  bool hasA16() const { return HasA16; }

  bool hasG16() const { return HasG16; }

  bool hasOffset3fBug() const {
    return HasOffset3fBug;
  }

  bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }

  bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }

  bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }

  bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }

  bool hasNSAEncoding() const { return HasNSAEncoding; }

  bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }

  unsigned getNSAMaxSize(bool HasSampler = false) const {
    return AMDGPU::getNSAMaxSize(*this, HasSampler);
  }

  bool hasGFX10_AEncoding() const {
    return GFX10_AEncoding;
  }

  bool hasGFX10_BEncoding() const {
    return GFX10_BEncoding;
  }

  bool hasGFX10_3Insts() const {
    return GFX10_3Insts;
  }

  bool hasMadF16() const;

  bool hasMovB64() const { return GFX940Insts; }

  bool hasLshlAddB64() const { return GFX940Insts; }

  bool enableSIScheduler() const {
    return EnableSIScheduler;
  }

  bool loadStoreOptEnabled() const {
    return EnableLoadStoreOpt;
  }

  bool hasSGPRInitBug() const {
    return SGPRInitBug;
  }

  bool hasUserSGPRInit16Bug() const {
    return UserSGPRInit16Bug && isWave32();
  }

  bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }

  bool hasNegativeUnalignedScratchOffsetBug() const {
    return NegativeUnalignedScratchOffsetBug;
  }

  bool hasMFMAInlineLiteralBug() const {
    return HasMFMAInlineLiteralBug;
  }

  bool has12DWordStoreHazard() const {
    return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // \returns true if the subtarget supports DWORDX3 load/store instructions.
  bool hasDwordx3LoadStores() const {
    return CIInsts;
  }

  bool hasReadM0MovRelInterpHazard() const {
    return getGeneration() == AMDGPUSubtarget::GFX9;
  }

  bool hasReadM0SendMsgHazard() const {
    return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
           getGeneration() <= AMDGPUSubtarget::GFX9;
  }

  bool hasReadM0LdsDmaHazard() const {
    return getGeneration() == AMDGPUSubtarget::GFX9;
  }

  bool hasReadM0LdsDirectHazard() const {
    return getGeneration() == AMDGPUSubtarget::GFX9;
  }

  bool hasVcmpxPermlaneHazard() const {
    return HasVcmpxPermlaneHazard;
  }

  bool hasVMEMtoScalarWriteHazard() const {
    return HasVMEMtoScalarWriteHazard;
  }

  bool hasSMEMtoVectorWriteHazard() const {
    return HasSMEMtoVectorWriteHazard;
  }

  bool hasLDSMisalignedBug() const {
    return LDSMisalignedBug && !EnableCuMode;
  }

  bool hasInstFwdPrefetchBug() const {
    return HasInstFwdPrefetchBug;
  }

  bool hasVcmpxExecWARHazard() const {
    return HasVcmpxExecWARHazard;
  }

  bool hasLdsBranchVmemWARHazard() const {
    return HasLdsBranchVmemWARHazard;
  }

  // The shift amount of a 64-bit shift cannot be the highest allocated
  // register if it is also at the end of the allocation block.
  bool hasShift64HighRegBug() const {
    return GFX90AInsts && !GFX940Insts;
  }

  // Has a one-cycle hazard on a transcendental instruction feeding a
  // non-transcendental VALU instruction.
  bool hasTransForwardingHazard() const { return GFX940Insts; }

  // Has a one-cycle hazard on a VALU instruction partially writing dst with
  // a shift of result bits feeding another VALU instruction.
  bool hasDstSelForwardingHazard() const { return GFX940Insts; }

  // Cannot use op_sel with v_dot instructions.
  bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }

  // Does not have HW interlocks for VALU writing and then reading SGPRs.
  bool hasVDecCoExecHazard() const {
    return GFX940Insts;
  }

  bool hasNSAtoVMEMBug() const {
    return HasNSAtoVMEMBug;
  }

  bool hasNSAClauseBug() const { return HasNSAClauseBug; }

  bool hasHardClauses() const { return getGeneration() >= GFX10; }

  bool hasGFX90AInsts() const { return GFX90AInsts; }

  bool hasFPAtomicToDenormModeHazard() const {
    return getGeneration() == GFX10;
  }

  bool hasVOP3DPP() const { return getGeneration() >= GFX11; }

  bool hasLdsDirect() const { return getGeneration() >= GFX11; }

  bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }

  bool hasVALUPartialForwardingHazard() const {
    return getGeneration() >= GFX11;
  }

  bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }

  bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }

  bool hasVALUMaskWriteHazard() const { return getGeneration() >= GFX11; }

  /// Return true if operations acting on VGPR tuples require even alignment.
  bool needsAlignedVGPRs() const { return GFX90AInsts; }

  /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
  bool hasSPackHL() const { return GFX11Insts; }

  /// Return true if the target's EXP instruction has the COMPR flag, which
  /// affects the meaning of the EN (enable) bits.
  bool hasCompressedExport() const { return !GFX11Insts; }

  /// Return true if the target's EXP instruction supports the NULL export
  /// target.
  bool hasNullExportTarget() const { return !GFX11Insts; }

  bool hasGFX11FullVGPRs() const { return HasGFX11FullVGPRs; }

  bool hasVOPDInsts() const { return HasVOPDInsts; }

  bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }

  /// Return true if the target has the S_DELAY_ALU instruction.
  bool hasDelayAlu() const { return GFX11Insts; }

  bool hasPackedTID() const { return HasPackedTID; }

  // GFX940 is a derivative of GFX90A. hasGFX940Insts() being true implies that
  // hasGFX90AInsts() is also true.
  bool hasGFX940Insts() const { return GFX940Insts; }

  bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }

  bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }

  bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }

  bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }

  /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
  /// SGPRs.
  unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;

  /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
  /// VGPRs.
  unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;

  /// Return occupancy for the given function, taking into account the used
  /// LDS and number of registers if provided.
  /// Note that occupancy can be affected by the scratch allocation as well,
  /// but we do not have enough information to compute it.
  unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
                            unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;

  /// \returns true if the flat_scratch register should be initialized with the
  /// pointer to the wave's scratch memory rather than a size and offset.
  bool flatScratchIsPointer() const {
    return getGeneration() >= AMDGPUSubtarget::GFX9;
  }

  /// \returns true if the flat_scratch register is initialized by the HW.
  /// In this case it is readonly.
  bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }

  /// \returns true if the architected SGPRs are enabled.
  bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }

  /// \returns true if Global Data Share is supported.
  bool hasGDS() const { return HasGDS; }

  /// \returns true if Global Wave Sync is supported.
  bool hasGWS() const { return HasGWS; }

  /// \returns true if the machine has merged shaders in which s0-s7 are
  /// reserved by the hardware and user SGPRs start at s8.
  bool hasMergedShaders() const {
    return getGeneration() >= GFX9;
  }

  // \returns true if the target supports the pre-NGG legacy geometry path.
  bool hasLegacyGeometry() const { return getGeneration() < GFX11; }

  // \returns true if preloading kernel arguments is supported.
  bool hasKernargPreload() const { return KernargPreload; }

  // \returns true if we need to generate backwards compatible code when
  // preloading kernel arguments.
  bool needsKernargPreloadBackwardsCompatibility() const {
    return hasKernargPreload() && !hasGFX940Insts();
  }

  // \returns true if the target has the split barriers feature.
  bool hasSplitBarriers() const { return getGeneration() >= GFX12; }

  // \returns true if the FP8/BF8 VOP1 form of conversion to F32 is unreliable.
  bool hasCvtFP8VOP1Bug() const { return true; }

  // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
  // no-return form.
  bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }

  // \returns true if the target has the DX10_CLAMP kernel descriptor mode bit.
  bool hasDX10ClampMode() const { return getGeneration() < GFX12; }

  // \returns true if the target has the IEEE kernel descriptor mode bit.
  bool hasIEEEMode() const { return getGeneration() < GFX12; }

  // \returns true if the target has IEEE fminimum/fmaximum instructions.
  bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }

  // \returns true if the target has the WG_RR_MODE kernel descriptor mode bit.
  bool hasRrWGMode() const { return getGeneration() >= GFX12; }

  /// \returns SGPR allocation granularity supported by the subtarget.
  unsigned getSGPRAllocGranule() const {
    return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
  }

  /// \returns SGPR encoding granularity supported by the subtarget.
  unsigned getSGPREncodingGranule() const {
    return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
  }

  /// \returns Total number of SGPRs supported by the subtarget.
  unsigned getTotalNumSGPRs() const {
    return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
  }

  /// \returns Addressable number of SGPRs supported by the subtarget.
  unsigned getAddressableNumSGPRs() const {
    return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
  }

  /// \returns Minimum number of SGPRs that meets the given number of waves per
  /// execution unit requirement supported by the subtarget.
  unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
    return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
  }

  /// \returns Maximum number of SGPRs that meets the given number of waves per
  /// execution unit requirement supported by the subtarget.
  unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
    return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
  }

  /// \returns Reserved number of SGPRs. This is a common utility function
  /// called by the MachineFunction and Function variants of
  /// getReservedNumSGPRs.
  unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;

  /// \returns Reserved number of SGPRs for given machine function \p MF.
  unsigned getReservedNumSGPRs(const MachineFunction &MF) const;

  /// \returns Reserved number of SGPRs for given function \p F.
  unsigned getReservedNumSGPRs(const Function &F) const;
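
  // Example of the attribute-driven behavior documented on the
  // getMaxNum{S,V}GPRs overloads below (the IR here is illustrative only):
  //
  //   define amdgpu_kernel void @kern() #0 { ... }
  //   attributes #0 = { "amdgpu-num-sgpr"="48" "amdgpu-waves-per-eu"="4,8" }
  //
  // Requested values that cannot be parsed or that violate the subtarget's
  // limits fall back to the waves-per-EU derived defaults.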

  /// \returns Maximum number of SGPRs. This is the common utility function
  /// called by the MachineFunction and Function variants of getMaxNumSGPRs.
  unsigned getBaseMaxNumSGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU,
                              unsigned PreloadedSGPRs,
                              unsigned ReservedNumSGPRs) const;

  /// \returns Maximum number of SGPRs that meets the number of waves per
  /// execution unit requirement for function \p MF, or number of SGPRs
  /// explicitly requested using the "amdgpu-num-sgpr" attribute attached to
  /// function \p MF.
  ///
  /// \returns Value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to an
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;

  /// \returns Maximum number of SGPRs that meets the number of waves per
  /// execution unit requirement for function \p F, or number of SGPRs
  /// explicitly requested using the "amdgpu-num-sgpr" attribute attached to
  /// function \p F.
  ///
  /// \returns Value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to an
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumSGPRs(const Function &F) const;

  /// \returns VGPR allocation granularity supported by the subtarget.
  unsigned getVGPRAllocGranule() const {
    return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
  }

  /// \returns VGPR encoding granularity supported by the subtarget.
  unsigned getVGPREncodingGranule() const {
    return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
  }

  /// \returns Total number of VGPRs supported by the subtarget.
  unsigned getTotalNumVGPRs() const {
    return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
  }

  /// \returns Addressable number of VGPRs supported by the subtarget.
  unsigned getAddressableNumVGPRs() const {
    return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
  }

  /// \returns the minimum number of VGPRs that will prevent achieving more
  /// than the specified number of waves \p WavesPerEU.
  unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
    return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
  }

  /// \returns the maximum number of VGPRs that can be used and still achieve
  /// at least the specified number of waves \p WavesPerEU.
  unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
    return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
  }

  /// \returns Maximum number of VGPRs. This is the common utility function
  /// called by the MachineFunction and Function variants of getMaxNumVGPRs.
  unsigned getBaseMaxNumVGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU) const;

  /// \returns Maximum number of VGPRs that meets the number of waves per
  /// execution unit requirement for function \p F, or number of VGPRs
  /// explicitly requested using the "amdgpu-num-vgpr" attribute attached to
  /// function \p F.
  ///
  /// \returns Value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to an
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumVGPRs(const Function &F) const;

  unsigned getMaxNumAGPRs(const Function &F) const {
    return getMaxNumVGPRs(F);
  }

  /// \returns Maximum number of VGPRs that meets the number of waves per
  /// execution unit requirement for function \p MF, or number of VGPRs
  /// explicitly requested using the "amdgpu-num-vgpr" attribute attached to
  /// function \p MF.
  ///
  /// \returns Value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to an
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;

  void getPostRAMutations(
      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
      const override;

  std::unique_ptr<ScheduleDAGMutation>
  createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;

  bool isWave32() const {
    return getWavefrontSize() == 32;
  }

  bool isWave64() const {
    return getWavefrontSize() == 64;
  }

  const TargetRegisterClass *getBoolRC() const {
    return getRegisterInfo()->getBoolRC();
  }

  /// \returns Maximum number of work groups per compute unit supported by the
  /// subtarget and limited by given \p FlatWorkGroupSize.
  unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
    return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
  }

  /// \returns Minimum flat work group size supported by the subtarget.
  unsigned getMinFlatWorkGroupSize() const override {
    return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
  }

  /// \returns Maximum flat work group size supported by the subtarget.
  unsigned getMaxFlatWorkGroupSize() const override {
    return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
  }

  /// \returns Number of waves per execution unit required to support the given
  /// \p FlatWorkGroupSize.
  unsigned
  getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
    return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
  }

  /// \returns Minimum number of waves per execution unit supported by the
  /// subtarget.
  unsigned getMinWavesPerEU() const override {
    return AMDGPU::IsaInfo::getMinWavesPerEU(this);
  }

  void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                             SDep &Dep) const override;

  // \returns true if it's beneficial on this subtarget for the scheduler to
  // cluster stores as well as loads.
  bool shouldClusterStores() const { return getGeneration() >= GFX11; }

  // \returns the number of address arguments from which to enable MIMG NSA
  // on supported architectures.
  unsigned getNSAThreshold(const MachineFunction &MF) const;

  // \returns true if the subtarget has a hazard requiring an "s_nop 0"
  // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
  bool requiresNopBeforeDeallocVGPRs() const {
    // Currently all targets that support the dealloc VGPRs message also
    // require the nop.
    return true;
  }
};

class GCNUserSGPRUsageInfo {
public:
  bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }

  bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }

  bool hasDispatchPtr() const { return DispatchPtr; }

  bool hasQueuePtr() const { return QueuePtr; }

  bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }

  bool hasDispatchID() const { return DispatchID; }

  bool hasFlatScratchInit() const { return FlatScratchInit; }

  unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }

  unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }

  unsigned getNumFreeUserSGPRs();

  void allocKernargPreloadSGPRs(unsigned NumSGPRs);

  enum UserSGPRID : unsigned {
    ImplicitBufferPtrID = 0,
    PrivateSegmentBufferID = 1,
    DispatchPtrID = 2,
    QueuePtrID = 3,
    KernargSegmentPtrID = 4,
    DispatchIdID = 5,
    FlatScratchInitID = 6,
    PrivateSegmentSizeID = 7
  };

  // Returns the size, in number of SGPRs, of the given preload user SGPR
  // field.
  static unsigned getNumUserSGPRForField(UserSGPRID ID) {
    switch (ID) {
    case ImplicitBufferPtrID:
      return 2;
    case PrivateSegmentBufferID:
      return 4;
    case DispatchPtrID:
      return 2;
    case QueuePtrID:
      return 2;
    case KernargSegmentPtrID:
      return 2;
    case DispatchIdID:
      return 2;
    case FlatScratchInitID:
      return 2;
    case PrivateSegmentSizeID:
      return 1;
    }
    llvm_unreachable("Unknown UserSGPRID.");
  }

  GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);

private:
  const GCNSubtarget &ST;

  // Private memory buffer.
  // Compute directly in sgpr[0:1].
  // Other shaders indirect 64-bits at sgpr[0:1].
  bool ImplicitBufferPtr = false;

  bool PrivateSegmentBuffer = false;

  bool DispatchPtr = false;

  bool QueuePtr = false;

  bool KernargSegmentPtr = false;

  bool DispatchID = false;

  bool FlatScratchInit = false;

  unsigned NumKernargPreloadSGPRs = 0;

  unsigned NumUsedUserSGPRs = 0;
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H