1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// AMD GCN specific subclass of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 16 17 #include "AMDGPUCallLowering.h" 18 #include "AMDGPURegisterBankInfo.h" 19 #include "AMDGPUSubtarget.h" 20 #include "SIFrameLowering.h" 21 #include "SIISelLowering.h" 22 #include "SIInstrInfo.h" 23 #include "Utils/AMDGPUBaseInfo.h" 24 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 25 #include "llvm/Support/ErrorHandling.h" 26 27 #define GET_SUBTARGETINFO_HEADER 28 #include "AMDGPUGenSubtargetInfo.inc" 29 30 namespace llvm { 31 32 class GCNTargetMachine; 33 34 class GCNSubtarget final : public AMDGPUGenSubtargetInfo, 35 public AMDGPUSubtarget { 36 public: 37 using AMDGPUSubtarget::getMaxWavesPerEU; 38 39 // Following 2 enums are documented at: 40 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 41 enum class TrapHandlerAbi { 42 NONE = 0x00, 43 AMDHSA = 0x01, 44 }; 45 46 enum class TrapID { 47 LLVMAMDHSATrap = 0x02, 48 LLVMAMDHSADebugTrap = 0x03, 49 }; 50 51 private: 52 /// GlobalISel related APIs. 53 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 54 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo; 55 std::unique_ptr<InstructionSelector> InstSelector; 56 std::unique_ptr<LegalizerInfo> Legalizer; 57 std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo; 58 59 protected: 60 // Basic subtarget description. 
61 Triple TargetTriple; 62 AMDGPU::IsaInfo::AMDGPUTargetID TargetID; 63 unsigned Gen = INVALID; 64 InstrItineraryData InstrItins; 65 int LDSBankCount = 0; 66 unsigned MaxPrivateElementSize = 0; 67 68 // Possibly statically set by tablegen, but may want to be overridden. 69 bool FastDenormalF32 = false; 70 bool HalfRate64Ops = false; 71 bool FullRate64Ops = false; 72 73 // Dynamically set bits that enable features. 74 bool FlatForGlobal = false; 75 bool AutoWaitcntBeforeBarrier = false; 76 bool BackOffBarrier = false; 77 bool UnalignedScratchAccess = false; 78 bool UnalignedAccessMode = false; 79 bool HasApertureRegs = false; 80 bool SupportsXNACK = false; 81 bool KernargPreload = false; 82 83 // This should not be used directly. 'TargetID' tracks the dynamic settings 84 // for XNACK. 85 bool EnableXNACK = false; 86 87 bool EnableTgSplit = false; 88 bool EnableCuMode = false; 89 bool TrapHandler = false; 90 bool EnablePreciseMemory = false; 91 92 // Used as options. 93 bool EnableLoadStoreOpt = false; 94 bool EnableUnsafeDSOffsetFolding = false; 95 bool EnableSIScheduler = false; 96 bool EnableDS128 = false; 97 bool EnablePRTStrictNull = false; 98 bool DumpCode = false; 99 100 // Subtarget properties statically set by tablegen 101 bool FP64 = false; 102 bool FMA = false; 103 bool MIMG_R128 = false; 104 bool CIInsts = false; 105 bool GFX8Insts = false; 106 bool GFX9Insts = false; 107 bool GFX90AInsts = false; 108 bool GFX940Insts = false; 109 bool GFX10Insts = false; 110 bool GFX11Insts = false; 111 bool GFX12Insts = false; 112 bool GFX10_3Insts = false; 113 bool GFX7GFX8GFX9Insts = false; 114 bool SGPRInitBug = false; 115 bool UserSGPRInit16Bug = false; 116 bool NegativeScratchOffsetBug = false; 117 bool NegativeUnalignedScratchOffsetBug = false; 118 bool HasSMemRealTime = false; 119 bool HasIntClamp = false; 120 bool HasFmaMixInsts = false; 121 bool HasMovrel = false; 122 bool HasVGPRIndexMode = false; 123 bool HasScalarDwordx3Loads = false; 124 bool 
HasScalarStores = false; 125 bool HasScalarAtomics = false; 126 bool HasSDWAOmod = false; 127 bool HasSDWAScalar = false; 128 bool HasSDWASdst = false; 129 bool HasSDWAMac = false; 130 bool HasSDWAOutModsVOPC = false; 131 bool HasDPP = false; 132 bool HasDPP8 = false; 133 bool HasDPALU_DPP = false; 134 bool HasDPPSrc1SGPR = false; 135 bool HasPackedFP32Ops = false; 136 bool HasImageInsts = false; 137 bool HasExtendedImageInsts = false; 138 bool HasR128A16 = false; 139 bool HasA16 = false; 140 bool HasG16 = false; 141 bool HasNSAEncoding = false; 142 bool HasPartialNSAEncoding = false; 143 bool GFX10_AEncoding = false; 144 bool GFX10_BEncoding = false; 145 bool HasDLInsts = false; 146 bool HasFmacF64Inst = false; 147 bool HasDot1Insts = false; 148 bool HasDot2Insts = false; 149 bool HasDot3Insts = false; 150 bool HasDot4Insts = false; 151 bool HasDot5Insts = false; 152 bool HasDot6Insts = false; 153 bool HasDot7Insts = false; 154 bool HasDot8Insts = false; 155 bool HasDot9Insts = false; 156 bool HasDot10Insts = false; 157 bool HasDot11Insts = false; 158 bool HasMAIInsts = false; 159 bool HasFP8Insts = false; 160 bool HasFP8ConversionInsts = false; 161 bool HasPkFmacF16Inst = false; 162 bool HasAtomicFMinFMaxF32GlobalInsts = false; 163 bool HasAtomicFMinFMaxF64GlobalInsts = false; 164 bool HasAtomicFMinFMaxF32FlatInsts = false; 165 bool HasAtomicFMinFMaxF64FlatInsts = false; 166 bool HasAtomicDsPkAdd16Insts = false; 167 bool HasAtomicFlatPkAdd16Insts = false; 168 bool HasAtomicFaddRtnInsts = false; 169 bool HasAtomicFaddNoRtnInsts = false; 170 bool HasMemoryAtomicFaddF32DenormalSupport = false; 171 bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false; 172 bool HasAtomicBufferGlobalPkAddF16Insts = false; 173 bool HasAtomicCSubNoRtnInsts = false; 174 bool HasAtomicGlobalPkAddBF16Inst = false; 175 bool HasAtomicBufferPkAddBF16Inst = false; 176 bool HasFlatAtomicFaddF32Inst = false; 177 bool HasFlatBufferGlobalAtomicFaddF64Inst = false; 178 bool HasDefaultComponentZero = 
false; 179 bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false; 180 bool HasDefaultComponentBroadcast = false; 181 /// The maximum number of instructions that may be placed within an S_CLAUSE, 182 /// which is one greater than the maximum argument to S_CLAUSE. A value of 0 183 /// indicates a lack of S_CLAUSE support. 184 unsigned MaxHardClauseLength = 0; 185 bool SupportsSRAMECC = false; 186 187 // This should not be used directly. 'TargetID' tracks the dynamic settings 188 // for SRAMECC. 189 bool EnableSRAMECC = false; 190 191 bool HasNoSdstCMPX = false; 192 bool HasVscnt = false; 193 bool HasGetWaveIdInst = false; 194 bool HasSMemTimeInst = false; 195 bool HasShaderCyclesRegister = false; 196 bool HasShaderCyclesHiLoRegisters = false; 197 bool HasVOP3Literal = false; 198 bool HasNoDataDepHazard = false; 199 bool FlatAddressSpace = false; 200 bool FlatInstOffsets = false; 201 bool FlatGlobalInsts = false; 202 bool FlatScratchInsts = false; 203 bool ScalarFlatScratchInsts = false; 204 bool HasArchitectedFlatScratch = false; 205 bool EnableFlatScratch = false; 206 bool HasArchitectedSGPRs = false; 207 bool HasGDS = false; 208 bool HasGWS = false; 209 bool AddNoCarryInsts = false; 210 bool HasUnpackedD16VMem = false; 211 bool LDSMisalignedBug = false; 212 bool HasMFMAInlineLiteralBug = false; 213 bool UnalignedBufferAccess = false; 214 bool UnalignedDSAccess = false; 215 bool HasPackedTID = false; 216 bool ScalarizeGlobal = false; 217 bool HasSALUFloatInsts = false; 218 bool HasVGPRSingleUseHintInsts = false; 219 bool HasPseudoScalarTrans = false; 220 bool HasRestrictedSOffset = false; 221 222 bool HasVcmpxPermlaneHazard = false; 223 bool HasVMEMtoScalarWriteHazard = false; 224 bool HasSMEMtoVectorWriteHazard = false; 225 bool HasInstFwdPrefetchBug = false; 226 bool HasVcmpxExecWARHazard = false; 227 bool HasLdsBranchVmemWARHazard = false; 228 bool HasNSAtoVMEMBug = false; 229 bool HasNSAClauseBug = false; 230 bool HasOffset3fBug = false; 231 bool 
HasFlatSegmentOffsetBug = false; 232 bool HasImageStoreD16Bug = false; 233 bool HasImageGather4D16Bug = false; 234 bool HasMSAALoadDstSelBug = false; 235 bool HasPrivEnabledTrap2NopBug = false; 236 bool Has1_5xVGPRs = false; 237 bool HasMADIntraFwdBug = false; 238 bool HasVOPDInsts = false; 239 bool HasVALUTransUseHazard = false; 240 bool HasForceStoreSC0SC1 = false; 241 bool HasRequiredExportPriority = false; 242 bool HasVmemWriteVgprInOrder = false; 243 244 bool RequiresCOV6 = false; 245 246 // Dummy feature to use for assembler in tablegen. 247 bool FeatureDisable = false; 248 249 SelectionDAGTargetInfo TSInfo; 250 private: 251 SIInstrInfo InstrInfo; 252 SITargetLowering TLInfo; 253 SIFrameLowering FrameLowering; 254 255 public: 256 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 257 const GCNTargetMachine &TM); 258 ~GCNSubtarget() override; 259 260 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, 261 StringRef GPU, StringRef FS); 262 263 /// Diagnose inconsistent subtarget features before attempting to codegen 264 /// function \p F. 
265 void checkSubtargetFeatures(const Function &F) const; 266 267 const SIInstrInfo *getInstrInfo() const override { 268 return &InstrInfo; 269 } 270 271 const SIFrameLowering *getFrameLowering() const override { 272 return &FrameLowering; 273 } 274 275 const SITargetLowering *getTargetLowering() const override { 276 return &TLInfo; 277 } 278 279 const SIRegisterInfo *getRegisterInfo() const override { 280 return &InstrInfo.getRegisterInfo(); 281 } 282 283 const CallLowering *getCallLowering() const override { 284 return CallLoweringInfo.get(); 285 } 286 287 const InlineAsmLowering *getInlineAsmLowering() const override { 288 return InlineAsmLoweringInfo.get(); 289 } 290 291 InstructionSelector *getInstructionSelector() const override { 292 return InstSelector.get(); 293 } 294 295 const LegalizerInfo *getLegalizerInfo() const override { 296 return Legalizer.get(); 297 } 298 299 const AMDGPURegisterBankInfo *getRegBankInfo() const override { 300 return RegBankInfo.get(); 301 } 302 303 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const { 304 return TargetID; 305 } 306 307 // Nothing implemented, just prevent crashes on use. 308 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 309 return &TSInfo; 310 } 311 312 const InstrItineraryData *getInstrItineraryData() const override { 313 return &InstrItins; 314 } 315 316 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); 317 318 Generation getGeneration() const { 319 return (Generation)Gen; 320 } 321 322 unsigned getMaxWaveScratchSize() const { 323 // See COMPUTE_TMPRING_SIZE.WAVESIZE. 324 if (getGeneration() >= GFX12) { 325 // 18-bit field in units of 64-dword. 326 return (64 * 4) * ((1 << 18) - 1); 327 } 328 if (getGeneration() == GFX11) { 329 // 15-bit field in units of 64-dword. 330 return (64 * 4) * ((1 << 15) - 1); 331 } 332 // 13-bit field in units of 256-dword. 
333 return (256 * 4) * ((1 << 13) - 1); 334 } 335 336 /// Return the number of high bits known to be zero for a frame index. 337 unsigned getKnownHighZeroBitsForFrameIndex() const { 338 return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2(); 339 } 340 341 int getLDSBankCount() const { 342 return LDSBankCount; 343 } 344 345 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { 346 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; 347 } 348 349 unsigned getConstantBusLimit(unsigned Opcode) const; 350 351 /// Returns if the result of this instruction with a 16-bit result returned in 352 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve 353 /// the original value. 354 bool zeroesHigh16BitsOfDest(unsigned Opcode) const; 355 356 bool supportsWGP() const { return getGeneration() >= GFX10; } 357 358 bool hasIntClamp() const { 359 return HasIntClamp; 360 } 361 362 bool hasFP64() const { 363 return FP64; 364 } 365 366 bool hasMIMG_R128() const { 367 return MIMG_R128; 368 } 369 370 bool hasHWFP64() const { 371 return FP64; 372 } 373 374 bool hasHalfRate64Ops() const { 375 return HalfRate64Ops; 376 } 377 378 bool hasFullRate64Ops() const { 379 return FullRate64Ops; 380 } 381 382 bool hasAddr64() const { 383 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 384 } 385 386 bool hasFlat() const { 387 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS); 388 } 389 390 // Return true if the target only has the reverse operand versions of VALU 391 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). 
392 bool hasOnlyRevVALUShifts() const { 393 return getGeneration() >= VOLCANIC_ISLANDS; 394 } 395 396 bool hasFractBug() const { 397 return getGeneration() == SOUTHERN_ISLANDS; 398 } 399 400 bool hasBFE() const { 401 return true; 402 } 403 404 bool hasBFI() const { 405 return true; 406 } 407 408 bool hasBFM() const { 409 return hasBFE(); 410 } 411 412 bool hasBCNT(unsigned Size) const { 413 return true; 414 } 415 416 bool hasFFBL() const { 417 return true; 418 } 419 420 bool hasFFBH() const { 421 return true; 422 } 423 424 bool hasMed3_16() const { 425 return getGeneration() >= AMDGPUSubtarget::GFX9; 426 } 427 428 bool hasMin3Max3_16() const { 429 return getGeneration() >= AMDGPUSubtarget::GFX9; 430 } 431 432 bool hasFmaMixInsts() const { 433 return HasFmaMixInsts; 434 } 435 436 bool hasCARRY() const { 437 return true; 438 } 439 440 bool hasFMA() const { 441 return FMA; 442 } 443 444 bool hasSwap() const { 445 return GFX9Insts; 446 } 447 448 bool hasScalarPackInsts() const { 449 return GFX9Insts; 450 } 451 452 bool hasScalarMulHiInsts() const { 453 return GFX9Insts; 454 } 455 456 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; } 457 458 TrapHandlerAbi getTrapHandlerAbi() const { 459 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; 460 } 461 462 bool supportsGetDoorbellID() const { 463 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. 464 return getGeneration() >= GFX9; 465 } 466 467 /// True if the offset field of DS instructions works as expected. On SI, the 468 /// offset uses a 16-bit adder and does not always wrap properly. 469 bool hasUsableDSOffset() const { 470 return getGeneration() >= SEA_ISLANDS; 471 } 472 473 bool unsafeDSOffsetFoldingEnabled() const { 474 return EnableUnsafeDSOffsetFolding; 475 } 476 477 /// Condition output from div_scale is usable. 
478 bool hasUsableDivScaleConditionOutput() const { 479 return getGeneration() != SOUTHERN_ISLANDS; 480 } 481 482 /// Extra wait hazard is needed in some cases before 483 /// s_cbranch_vccnz/s_cbranch_vccz. 484 bool hasReadVCCZBug() const { 485 return getGeneration() <= SEA_ISLANDS; 486 } 487 488 /// Writes to VCC_LO/VCC_HI update the VCCZ flag. 489 bool partialVCCWritesUpdateVCCZ() const { 490 return getGeneration() >= GFX10; 491 } 492 493 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 494 /// was written by a VALU instruction. 495 bool hasSMRDReadVALUDefHazard() const { 496 return getGeneration() == SOUTHERN_ISLANDS; 497 } 498 499 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 500 /// SGPR was written by a VALU Instruction. 501 bool hasVMEMReadSGPRVALUDefHazard() const { 502 return getGeneration() >= VOLCANIC_ISLANDS; 503 } 504 505 bool hasRFEHazards() const { 506 return getGeneration() >= VOLCANIC_ISLANDS; 507 } 508 509 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. 510 unsigned getSetRegWaitStates() const { 511 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 512 } 513 514 bool dumpCode() const { 515 return DumpCode; 516 } 517 518 /// Return the amount of LDS that can be used that will not restrict the 519 /// occupancy lower than WaveCount. 520 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 521 const Function &) const; 522 523 bool supportsMinMaxDenormModes() const { 524 return getGeneration() >= AMDGPUSubtarget::GFX9; 525 } 526 527 /// \returns If target supports S_DENORM_MODE. 528 bool hasDenormModeInst() const { 529 return getGeneration() >= AMDGPUSubtarget::GFX10; 530 } 531 532 bool useFlatForGlobal() const { 533 return FlatForGlobal; 534 } 535 536 /// \returns If target supports ds_read/write_b128 and user enables generation 537 /// of ds_read/write_b128. 
538 bool useDS128() const { 539 return CIInsts && EnableDS128; 540 } 541 542 /// \return If target supports ds_read/write_b96/128. 543 bool hasDS96AndDS128() const { 544 return CIInsts; 545 } 546 547 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 548 bool haveRoundOpsF64() const { 549 return CIInsts; 550 } 551 552 /// \returns If MUBUF instructions always perform range checking, even for 553 /// buffer resources used for private memory access. 554 bool privateMemoryResourceIsRangeChecked() const { 555 return getGeneration() < AMDGPUSubtarget::GFX9; 556 } 557 558 /// \returns If target requires PRT Struct NULL support (zero result registers 559 /// for sparse texture support). 560 bool usePRTStrictNull() const { 561 return EnablePRTStrictNull; 562 } 563 564 bool hasAutoWaitcntBeforeBarrier() const { 565 return AutoWaitcntBeforeBarrier; 566 } 567 568 /// \returns true if the target supports backing off of s_barrier instructions 569 /// when an exception is raised. 570 bool supportsBackOffBarrier() const { 571 return BackOffBarrier; 572 } 573 574 bool hasUnalignedBufferAccess() const { 575 return UnalignedBufferAccess; 576 } 577 578 bool hasUnalignedBufferAccessEnabled() const { 579 return UnalignedBufferAccess && UnalignedAccessMode; 580 } 581 582 bool hasUnalignedDSAccess() const { 583 return UnalignedDSAccess; 584 } 585 586 bool hasUnalignedDSAccessEnabled() const { 587 return UnalignedDSAccess && UnalignedAccessMode; 588 } 589 590 bool hasUnalignedScratchAccess() const { 591 return UnalignedScratchAccess; 592 } 593 594 bool hasUnalignedAccessMode() const { 595 return UnalignedAccessMode; 596 } 597 598 bool hasApertureRegs() const { 599 return HasApertureRegs; 600 } 601 602 bool isTrapHandlerEnabled() const { 603 return TrapHandler; 604 } 605 606 bool isXNACKEnabled() const { 607 return TargetID.isXnackOnOrAny(); 608 } 609 610 bool isTgSplitEnabled() const { 611 return EnableTgSplit; 612 } 613 614 bool isCuModeEnabled() const { 615 return EnableCuMode; 616 } 617 618 
bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; } 619 620 bool hasFlatAddressSpace() const { 621 return FlatAddressSpace; 622 } 623 624 bool hasFlatScrRegister() const { 625 return hasFlatAddressSpace(); 626 } 627 628 bool hasFlatInstOffsets() const { 629 return FlatInstOffsets; 630 } 631 632 bool hasFlatGlobalInsts() const { 633 return FlatGlobalInsts; 634 } 635 636 bool hasFlatScratchInsts() const { 637 return FlatScratchInsts; 638 } 639 640 // Check if target supports ST addressing mode with FLAT scratch instructions. 641 // The ST addressing mode means no registers are used, either VGPR or SGPR, 642 // but only immediate offset is swizzled and added to the FLAT scratch base. 643 bool hasFlatScratchSTMode() const { 644 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts()); 645 } 646 647 bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; } 648 649 bool hasScalarFlatScratchInsts() const { 650 return ScalarFlatScratchInsts; 651 } 652 653 bool enableFlatScratch() const { 654 return flatScratchIsArchitected() || 655 (EnableFlatScratch && hasFlatScratchInsts()); 656 } 657 658 bool hasGlobalAddTidInsts() const { 659 return GFX10_BEncoding; 660 } 661 662 bool hasAtomicCSub() const { 663 return GFX10_BEncoding; 664 } 665 666 bool hasExportInsts() const { 667 return !hasGFX940Insts(); 668 } 669 670 bool hasVINTERPEncoding() const { 671 return GFX11Insts; 672 } 673 674 // DS_ADD_F64/DS_ADD_RTN_F64 675 bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); } 676 677 bool hasMultiDwordFlatScratchAddressing() const { 678 return getGeneration() >= GFX9; 679 } 680 681 bool hasFlatSegmentOffsetBug() const { 682 return HasFlatSegmentOffsetBug; 683 } 684 685 bool hasFlatLgkmVMemCountInOrder() const { 686 return getGeneration() > GFX9; 687 } 688 689 bool hasD16LoadStore() const { 690 return getGeneration() >= GFX9; 691 } 692 693 bool d16PreservesUnusedBits() const { 694 return hasD16LoadStore() && 
!TargetID.isSramEccOnOrAny(); 695 } 696 697 bool hasD16Images() const { 698 return getGeneration() >= VOLCANIC_ISLANDS; 699 } 700 701 /// Return if most LDS instructions have an m0 use that require m0 to be 702 /// initialized. 703 bool ldsRequiresM0Init() const { 704 return getGeneration() < GFX9; 705 } 706 707 // True if the hardware rewinds and replays GWS operations if a wave is 708 // preempted. 709 // 710 // If this is false, a GWS operation requires testing if a nack set the 711 // MEM_VIOL bit, and repeating if so. 712 bool hasGWSAutoReplay() const { 713 return getGeneration() >= GFX9; 714 } 715 716 /// \returns if target has ds_gws_sema_release_all instruction. 717 bool hasGWSSemaReleaseAll() const { 718 return CIInsts; 719 } 720 721 /// \returns true if the target has integer add/sub instructions that do not 722 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, 723 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier 724 /// for saturation. 
725 bool hasAddNoCarry() const { 726 return AddNoCarryInsts; 727 } 728 729 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; } 730 731 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; } 732 733 bool hasUnpackedD16VMem() const { 734 return HasUnpackedD16VMem; 735 } 736 737 // Covers VS/PS/CS graphics shaders 738 bool isMesaGfxShader(const Function &F) const { 739 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 740 } 741 742 bool hasMad64_32() const { 743 return getGeneration() >= SEA_ISLANDS; 744 } 745 746 bool hasSDWAOmod() const { 747 return HasSDWAOmod; 748 } 749 750 bool hasSDWAScalar() const { 751 return HasSDWAScalar; 752 } 753 754 bool hasSDWASdst() const { 755 return HasSDWASdst; 756 } 757 758 bool hasSDWAMac() const { 759 return HasSDWAMac; 760 } 761 762 bool hasSDWAOutModsVOPC() const { 763 return HasSDWAOutModsVOPC; 764 } 765 766 bool hasDLInsts() const { 767 return HasDLInsts; 768 } 769 770 bool hasFmacF64Inst() const { return HasFmacF64Inst; } 771 772 bool hasDot1Insts() const { 773 return HasDot1Insts; 774 } 775 776 bool hasDot2Insts() const { 777 return HasDot2Insts; 778 } 779 780 bool hasDot3Insts() const { 781 return HasDot3Insts; 782 } 783 784 bool hasDot4Insts() const { 785 return HasDot4Insts; 786 } 787 788 bool hasDot5Insts() const { 789 return HasDot5Insts; 790 } 791 792 bool hasDot6Insts() const { 793 return HasDot6Insts; 794 } 795 796 bool hasDot7Insts() const { 797 return HasDot7Insts; 798 } 799 800 bool hasDot8Insts() const { 801 return HasDot8Insts; 802 } 803 804 bool hasDot9Insts() const { 805 return HasDot9Insts; 806 } 807 808 bool hasDot10Insts() const { 809 return HasDot10Insts; 810 } 811 812 bool hasDot11Insts() const { 813 return HasDot11Insts; 814 } 815 816 bool hasMAIInsts() const { 817 return HasMAIInsts; 818 } 819 820 bool hasFP8Insts() const { 821 return HasFP8Insts; 822 } 823 824 bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; } 825 826 bool hasPkFmacF16Inst() 
const { 827 return HasPkFmacF16Inst; 828 } 829 830 bool hasAtomicFMinFMaxF32GlobalInsts() const { 831 return HasAtomicFMinFMaxF32GlobalInsts; 832 } 833 834 bool hasAtomicFMinFMaxF64GlobalInsts() const { 835 return HasAtomicFMinFMaxF64GlobalInsts; 836 } 837 838 bool hasAtomicFMinFMaxF32FlatInsts() const { 839 return HasAtomicFMinFMaxF32FlatInsts; 840 } 841 842 bool hasAtomicFMinFMaxF64FlatInsts() const { 843 return HasAtomicFMinFMaxF64FlatInsts; 844 } 845 846 bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; } 847 848 bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; } 849 850 bool hasAtomicFaddInsts() const { 851 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts; 852 } 853 854 bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; } 855 856 bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } 857 858 bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const { 859 return HasAtomicBufferGlobalPkAddF16NoRtnInsts; 860 } 861 862 bool hasAtomicBufferGlobalPkAddF16Insts() const { 863 return HasAtomicBufferGlobalPkAddF16Insts; 864 } 865 866 bool hasAtomicGlobalPkAddBF16Inst() const { 867 return HasAtomicGlobalPkAddBF16Inst; 868 } 869 870 bool hasAtomicBufferPkAddBF16Inst() const { 871 return HasAtomicBufferPkAddBF16Inst; 872 } 873 874 bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } 875 876 /// \return true if the target has flat, global, and buffer atomic fadd for 877 /// double. 878 bool hasFlatBufferGlobalAtomicFaddF64Inst() const { 879 return HasFlatBufferGlobalAtomicFaddF64Inst; 880 } 881 882 /// \return true if the target's flat, global, and buffer atomic fadd for 883 /// float supports denormal handling. 
884 bool hasMemoryAtomicFaddF32DenormalSupport() const { 885 return HasMemoryAtomicFaddF32DenormalSupport; 886 } 887 888 /// \return true if atomic operations targeting fine-grained memory work 889 /// correctly at device scope, in allocations in host or peer PCIe device 890 /// memory. 891 bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const { 892 return HasAgentScopeFineGrainedRemoteMemoryAtomics; 893 } 894 895 bool hasDefaultComponentZero() const { return HasDefaultComponentZero; } 896 897 bool hasDefaultComponentBroadcast() const { 898 return HasDefaultComponentBroadcast; 899 } 900 901 bool hasNoSdstCMPX() const { 902 return HasNoSdstCMPX; 903 } 904 905 bool hasVscnt() const { 906 return HasVscnt; 907 } 908 909 bool hasGetWaveIdInst() const { 910 return HasGetWaveIdInst; 911 } 912 913 bool hasSMemTimeInst() const { 914 return HasSMemTimeInst; 915 } 916 917 bool hasShaderCyclesRegister() const { 918 return HasShaderCyclesRegister; 919 } 920 921 bool hasShaderCyclesHiLoRegisters() const { 922 return HasShaderCyclesHiLoRegisters; 923 } 924 925 bool hasVOP3Literal() const { 926 return HasVOP3Literal; 927 } 928 929 bool hasNoDataDepHazard() const { 930 return HasNoDataDepHazard; 931 } 932 933 bool vmemWriteNeedsExpWaitcnt() const { 934 return getGeneration() < SEA_ISLANDS; 935 } 936 937 bool hasInstPrefetch() const { 938 return getGeneration() == GFX10 || getGeneration() == GFX11; 939 } 940 941 bool hasPrefetch() const { return GFX12Insts; } 942 943 // Has s_cmpk_* instructions. 944 bool hasSCmpK() const { return getGeneration() < GFX12; } 945 946 // Scratch is allocated in 256 dword per wave blocks for the entire 947 // wavefront. When viewed from the perspective of an arbitrary workitem, this 948 // is 4-byte aligned. 949 // 950 // Only 4-byte alignment is really needed to access anything. Transformations 951 // on the pointer value itself may rely on the alignment / known low bits of 952 // the pointer. 
Set this to something above the minimum to avoid needing 953 // dynamic realignment in common cases. 954 Align getStackAlignment() const { return Align(16); } 955 956 bool enableMachineScheduler() const override { 957 return true; 958 } 959 960 bool useAA() const override; 961 962 bool enableSubRegLiveness() const override { 963 return true; 964 } 965 966 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } 967 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 968 969 // static wrappers 970 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 971 972 // XXX - Why is this here if it isn't in the default pass set? 973 bool enableEarlyIfConversion() const override { 974 return true; 975 } 976 977 void overrideSchedPolicy(MachineSchedPolicy &Policy, 978 unsigned NumRegionInstrs) const override; 979 980 void mirFileLoaded(MachineFunction &MF) const override; 981 982 unsigned getMaxNumUserSGPRs() const { 983 return AMDGPU::getMaxNumUserSGPRs(*this); 984 } 985 986 bool hasSMemRealTime() const { 987 return HasSMemRealTime; 988 } 989 990 bool hasMovrel() const { 991 return HasMovrel; 992 } 993 994 bool hasVGPRIndexMode() const { 995 return HasVGPRIndexMode; 996 } 997 998 bool useVGPRIndexMode() const; 999 1000 bool hasScalarCompareEq64() const { 1001 return getGeneration() >= VOLCANIC_ISLANDS; 1002 } 1003 1004 bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; } 1005 1006 bool hasScalarStores() const { 1007 return HasScalarStores; 1008 } 1009 1010 bool hasScalarAtomics() const { 1011 return HasScalarAtomics; 1012 } 1013 1014 bool hasLDSFPAtomicAddF32() const { return GFX8Insts; } 1015 bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; } 1016 1017 /// \returns true if the subtarget has the v_permlanex16_b32 instruction. 1018 bool hasPermLaneX16() const { return getGeneration() >= GFX10; } 1019 1020 /// \returns true if the subtarget has the v_permlane64_b32 instruction. 
1021 bool hasPermLane64() const { return getGeneration() >= GFX11; } 1022 1023 bool hasDPP() const { 1024 return HasDPP; 1025 } 1026 1027 bool hasDPPBroadcasts() const { 1028 return HasDPP && getGeneration() < GFX10; 1029 } 1030 1031 bool hasDPPWavefrontShifts() const { 1032 return HasDPP && getGeneration() < GFX10; 1033 } 1034 1035 bool hasDPP8() const { 1036 return HasDPP8; 1037 } 1038 1039 bool hasDPALU_DPP() const { 1040 return HasDPALU_DPP; 1041 } 1042 1043 bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; } 1044 1045 bool hasPackedFP32Ops() const { 1046 return HasPackedFP32Ops; 1047 } 1048 1049 // Has V_PK_MOV_B32 opcode 1050 bool hasPkMovB32() const { 1051 return GFX90AInsts; 1052 } 1053 1054 bool hasFmaakFmamkF32Insts() const { 1055 return getGeneration() >= GFX10 || hasGFX940Insts(); 1056 } 1057 1058 bool hasImageInsts() const { 1059 return HasImageInsts; 1060 } 1061 1062 bool hasExtendedImageInsts() const { 1063 return HasExtendedImageInsts; 1064 } 1065 1066 bool hasR128A16() const { 1067 return HasR128A16; 1068 } 1069 1070 bool hasA16() const { return HasA16; } 1071 1072 bool hasG16() const { return HasG16; } 1073 1074 bool hasOffset3fBug() const { 1075 return HasOffset3fBug; 1076 } 1077 1078 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } 1079 1080 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } 1081 1082 bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; } 1083 1084 bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; } 1085 1086 bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; } 1087 1088 bool hasNSAEncoding() const { return HasNSAEncoding; } 1089 1090 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; } 1091 1092 bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; } 1093 1094 unsigned getNSAMaxSize(bool HasSampler = false) const { 1095 return AMDGPU::getNSAMaxSize(*this, HasSampler); 1096 } 1097 1098 bool hasGFX10_AEncoding() const { 
1099 return GFX10_AEncoding; 1100 } 1101 1102 bool hasGFX10_BEncoding() const { 1103 return GFX10_BEncoding; 1104 } 1105 1106 bool hasGFX10_3Insts() const { 1107 return GFX10_3Insts; 1108 } 1109 1110 bool hasMadF16() const; 1111 1112 bool hasMovB64() const { return GFX940Insts; } 1113 1114 bool hasLshlAddB64() const { return GFX940Insts; } 1115 1116 bool enableSIScheduler() const { 1117 return EnableSIScheduler; 1118 } 1119 1120 bool loadStoreOptEnabled() const { 1121 return EnableLoadStoreOpt; 1122 } 1123 1124 bool hasSGPRInitBug() const { 1125 return SGPRInitBug; 1126 } 1127 1128 bool hasUserSGPRInit16Bug() const { 1129 return UserSGPRInit16Bug && isWave32(); 1130 } 1131 1132 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } 1133 1134 bool hasNegativeUnalignedScratchOffsetBug() const { 1135 return NegativeUnalignedScratchOffsetBug; 1136 } 1137 1138 bool hasMFMAInlineLiteralBug() const { 1139 return HasMFMAInlineLiteralBug; 1140 } 1141 1142 bool has12DWordStoreHazard() const { 1143 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 1144 } 1145 1146 // \returns true if the subtarget supports DWORDX3 load/store instructions. 
1147 bool hasDwordx3LoadStores() const { 1148 return CIInsts; 1149 } 1150 1151 bool hasReadM0MovRelInterpHazard() const { 1152 return getGeneration() == AMDGPUSubtarget::GFX9; 1153 } 1154 1155 bool hasReadM0SendMsgHazard() const { 1156 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 1157 getGeneration() <= AMDGPUSubtarget::GFX9; 1158 } 1159 1160 bool hasReadM0LdsDmaHazard() const { 1161 return getGeneration() == AMDGPUSubtarget::GFX9; 1162 } 1163 1164 bool hasReadM0LdsDirectHazard() const { 1165 return getGeneration() == AMDGPUSubtarget::GFX9; 1166 } 1167 1168 bool hasVcmpxPermlaneHazard() const { 1169 return HasVcmpxPermlaneHazard; 1170 } 1171 1172 bool hasVMEMtoScalarWriteHazard() const { 1173 return HasVMEMtoScalarWriteHazard; 1174 } 1175 1176 bool hasSMEMtoVectorWriteHazard() const { 1177 return HasSMEMtoVectorWriteHazard; 1178 } 1179 1180 bool hasLDSMisalignedBug() const { 1181 return LDSMisalignedBug && !EnableCuMode; 1182 } 1183 1184 bool hasInstFwdPrefetchBug() const { 1185 return HasInstFwdPrefetchBug; 1186 } 1187 1188 bool hasVcmpxExecWARHazard() const { 1189 return HasVcmpxExecWARHazard; 1190 } 1191 1192 bool hasLdsBranchVmemWARHazard() const { 1193 return HasLdsBranchVmemWARHazard; 1194 } 1195 1196 // Shift amount of a 64 bit shift cannot be a highest allocated register 1197 // if also at the end of the allocation block. 1198 bool hasShift64HighRegBug() const { 1199 return GFX90AInsts && !GFX940Insts; 1200 } 1201 1202 // Has one cycle hazard on transcendental instruction feeding a 1203 // non transcendental VALU. 1204 bool hasTransForwardingHazard() const { return GFX940Insts; } 1205 1206 // Has one cycle hazard on a VALU instruction partially writing dst with 1207 // a shift of result bits feeding another VALU instruction. 1208 bool hasDstSelForwardingHazard() const { return GFX940Insts; } 1209 1210 // Cannot use op_sel with v_dot instructions. 
1211 bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; } 1212 1213 // Does not have HW interlocs for VALU writing and then reading SGPRs. 1214 bool hasVDecCoExecHazard() const { 1215 return GFX940Insts; 1216 } 1217 1218 bool hasNSAtoVMEMBug() const { 1219 return HasNSAtoVMEMBug; 1220 } 1221 1222 bool hasNSAClauseBug() const { return HasNSAClauseBug; } 1223 1224 bool hasHardClauses() const { return MaxHardClauseLength > 0; } 1225 1226 bool hasGFX90AInsts() const { return GFX90AInsts; } 1227 1228 bool hasFPAtomicToDenormModeHazard() const { 1229 return getGeneration() == GFX10; 1230 } 1231 1232 bool hasVOP3DPP() const { return getGeneration() >= GFX11; } 1233 1234 bool hasLdsDirect() const { return getGeneration() >= GFX11; } 1235 1236 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; } 1237 1238 bool hasVALUPartialForwardingHazard() const { 1239 return getGeneration() == GFX11; 1240 } 1241 1242 bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; } 1243 1244 bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; } 1245 1246 bool requiresCodeObjectV6() const { return RequiresCOV6; } 1247 1248 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } 1249 1250 /// Return if operations acting on VGPR tuples require even alignment. 1251 bool needsAlignedVGPRs() const { return GFX90AInsts; } 1252 1253 /// Return true if the target has the S_PACK_HL_B32_B16 instruction. 1254 bool hasSPackHL() const { return GFX11Insts; } 1255 1256 /// Return true if the target's EXP instruction has the COMPR flag, which 1257 /// affects the meaning of the EN (enable) bits. 1258 bool hasCompressedExport() const { return !GFX11Insts; } 1259 1260 /// Return true if the target's EXP instruction supports the NULL export 1261 /// target. 
1262 bool hasNullExportTarget() const { return !GFX11Insts; } 1263 1264 bool has1_5xVGPRs() const { return Has1_5xVGPRs; } 1265 1266 bool hasVOPDInsts() const { return HasVOPDInsts; } 1267 1268 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } 1269 1270 /// Return true if the target has the S_DELAY_ALU instruction. 1271 bool hasDelayAlu() const { return GFX11Insts; } 1272 1273 bool hasPackedTID() const { return HasPackedTID; } 1274 1275 // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that 1276 // hasGFX90AInsts is also true. 1277 bool hasGFX940Insts() const { return GFX940Insts; } 1278 1279 bool hasSALUFloatInsts() const { return HasSALUFloatInsts; } 1280 1281 bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; } 1282 1283 bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; } 1284 1285 bool hasRestrictedSOffset() const { return HasRestrictedSOffset; } 1286 1287 bool hasRequiredExportPriority() const { return HasRequiredExportPriority; } 1288 1289 bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; } 1290 1291 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt 1292 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively. 1293 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; } 1294 1295 /// \returns true if inline constants are not supported for F16 pseudo 1296 /// scalar transcendentals. 1297 bool hasNoF16PseudoScalarTransInlineConstants() const { 1298 return getGeneration() == GFX12; 1299 } 1300 1301 /// \returns The maximum number of instructions that can be enclosed in an 1302 /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that 1303 /// instruction. 
1304 unsigned maxHardClauseLength() const { return MaxHardClauseLength; } 1305 1306 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 1307 /// SGPRs 1308 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 1309 1310 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 1311 /// VGPRs 1312 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 1313 1314 /// Return occupancy for the given function. Used LDS and a number of 1315 /// registers if provided. 1316 /// Note, occupancy can be affected by the scratch allocation as well, but 1317 /// we do not have enough information to compute it. 1318 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, 1319 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; 1320 1321 /// \returns true if the flat_scratch register should be initialized with the 1322 /// pointer to the wave's scratch memory rather than a size and offset. 1323 bool flatScratchIsPointer() const { 1324 return getGeneration() >= AMDGPUSubtarget::GFX9; 1325 } 1326 1327 /// \returns true if the flat_scratch register is initialized by the HW. 1328 /// In this case it is readonly. 1329 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } 1330 1331 /// \returns true if the architected SGPRs are enabled. 1332 bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; } 1333 1334 /// \returns true if Global Data Share is supported. 1335 bool hasGDS() const { return HasGDS; } 1336 1337 /// \returns true if Global Wave Sync is supported. 1338 bool hasGWS() const { return HasGWS; } 1339 1340 /// \returns true if the machine has merged shaders in which s0-s7 are 1341 /// reserved by the hardware and user SGPRs start at s8 1342 bool hasMergedShaders() const { 1343 return getGeneration() >= GFX9; 1344 } 1345 1346 // \returns true if the target supports the pre-NGG legacy geometry path. 
1347 bool hasLegacyGeometry() const { return getGeneration() < GFX11; } 1348 1349 // \returns true if preloading kernel arguments is supported. 1350 bool hasKernargPreload() const { return KernargPreload; } 1351 1352 // \returns true if the target has split barriers feature 1353 bool hasSplitBarriers() const { return getGeneration() >= GFX12; } 1354 1355 // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable. 1356 bool hasCvtFP8VOP1Bug() const { return true; } 1357 1358 // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a 1359 // no-return form. 1360 bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; } 1361 1362 // \returns true if the target has DX10_CLAMP kernel descriptor mode bit 1363 bool hasDX10ClampMode() const { return getGeneration() < GFX12; } 1364 1365 // \returns true if the target has IEEE kernel descriptor mode bit 1366 bool hasIEEEMode() const { return getGeneration() < GFX12; } 1367 1368 // \returns true if the target has IEEE fminimum/fmaximum instructions 1369 bool hasIEEEMinMax() const { return getGeneration() >= GFX12; } 1370 1371 // \returns true if the target has IEEE fminimum3/fmaximum3 instructions 1372 bool hasIEEEMinMax3() const { return hasIEEEMinMax(); } 1373 1374 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit 1375 bool hasRrWGMode() const { return getGeneration() >= GFX12; } 1376 1377 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative 1378 /// values. 1379 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; } 1380 1381 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead 1382 // of sign-extending. 1383 bool hasGetPCZeroExtension() const { return GFX12Insts; } 1384 1385 /// \returns SGPR allocation granularity supported by the subtarget. 
1386 unsigned getSGPRAllocGranule() const { 1387 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 1388 } 1389 1390 /// \returns SGPR encoding granularity supported by the subtarget. 1391 unsigned getSGPREncodingGranule() const { 1392 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 1393 } 1394 1395 /// \returns Total number of SGPRs supported by the subtarget. 1396 unsigned getTotalNumSGPRs() const { 1397 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 1398 } 1399 1400 /// \returns Addressable number of SGPRs supported by the subtarget. 1401 unsigned getAddressableNumSGPRs() const { 1402 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); 1403 } 1404 1405 /// \returns Minimum number of SGPRs that meets the given number of waves per 1406 /// execution unit requirement supported by the subtarget. 1407 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 1408 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); 1409 } 1410 1411 /// \returns Maximum number of SGPRs that meets the given number of waves per 1412 /// execution unit requirement supported by the subtarget. 1413 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 1414 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); 1415 } 1416 1417 /// \returns Reserved number of SGPRs. This is common 1418 /// utility function called by MachineFunction and 1419 /// Function variants of getReservedNumSGPRs. 1420 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const; 1421 /// \returns Reserved number of SGPRs for given machine function \p MF. 1422 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 1423 1424 /// \returns Reserved number of SGPRs for given function \p F. 1425 unsigned getReservedNumSGPRs(const Function &F) const; 1426 1427 /// \returns max num SGPRs. This is the common utility 1428 /// function called by MachineFunction and Function 1429 /// variants of getMaxNumSGPRs. 
1430 unsigned getBaseMaxNumSGPRs(const Function &F, 1431 std::pair<unsigned, unsigned> WavesPerEU, 1432 unsigned PreloadedSGPRs, 1433 unsigned ReservedNumSGPRs) const; 1434 1435 /// \returns Maximum number of SGPRs that meets number of waves per execution 1436 /// unit requirement for function \p MF, or number of SGPRs explicitly 1437 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 1438 /// 1439 /// \returns Value that meets number of waves per execution unit requirement 1440 /// if explicitly requested value cannot be converted to integer, violates 1441 /// subtarget's specifications, or does not meet number of waves per execution 1442 /// unit requirement. 1443 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 1444 1445 /// \returns Maximum number of SGPRs that meets number of waves per execution 1446 /// unit requirement for function \p F, or number of SGPRs explicitly 1447 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F. 1448 /// 1449 /// \returns Value that meets number of waves per execution unit requirement 1450 /// if explicitly requested value cannot be converted to integer, violates 1451 /// subtarget's specifications, or does not meet number of waves per execution 1452 /// unit requirement. 1453 unsigned getMaxNumSGPRs(const Function &F) const; 1454 1455 /// \returns VGPR allocation granularity supported by the subtarget. 1456 unsigned getVGPRAllocGranule() const { 1457 return AMDGPU::IsaInfo::getVGPRAllocGranule(this); 1458 } 1459 1460 /// \returns VGPR encoding granularity supported by the subtarget. 1461 unsigned getVGPREncodingGranule() const { 1462 return AMDGPU::IsaInfo::getVGPREncodingGranule(this); 1463 } 1464 1465 /// \returns Total number of VGPRs supported by the subtarget. 1466 unsigned getTotalNumVGPRs() const { 1467 return AMDGPU::IsaInfo::getTotalNumVGPRs(this); 1468 } 1469 1470 /// \returns Addressable number of architectural VGPRs supported by the 1471 /// subtarget. 
1472 unsigned getAddressableNumArchVGPRs() const { 1473 return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this); 1474 } 1475 1476 /// \returns Addressable number of VGPRs supported by the subtarget. 1477 unsigned getAddressableNumVGPRs() const { 1478 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); 1479 } 1480 1481 /// \returns the minimum number of VGPRs that will prevent achieving more than 1482 /// the specified number of waves \p WavesPerEU. 1483 unsigned getMinNumVGPRs(unsigned WavesPerEU) const { 1484 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); 1485 } 1486 1487 /// \returns the maximum number of VGPRs that can be used and still achieved 1488 /// at least the specified number of waves \p WavesPerEU. 1489 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { 1490 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); 1491 } 1492 1493 /// \returns max num VGPRs. This is the common utility function 1494 /// called by MachineFunction and Function variants of getMaxNumVGPRs. 1495 unsigned getBaseMaxNumVGPRs(const Function &F, 1496 std::pair<unsigned, unsigned> WavesPerEU) const; 1497 /// \returns Maximum number of VGPRs that meets number of waves per execution 1498 /// unit requirement for function \p F, or number of VGPRs explicitly 1499 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F. 1500 /// 1501 /// \returns Value that meets number of waves per execution unit requirement 1502 /// if explicitly requested value cannot be converted to integer, violates 1503 /// subtarget's specifications, or does not meet number of waves per execution 1504 /// unit requirement. 
1505 unsigned getMaxNumVGPRs(const Function &F) const; 1506 1507 unsigned getMaxNumAGPRs(const Function &F) const { 1508 return getMaxNumVGPRs(F); 1509 } 1510 1511 /// \returns Maximum number of VGPRs that meets number of waves per execution 1512 /// unit requirement for function \p MF, or number of VGPRs explicitly 1513 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 1514 /// 1515 /// \returns Value that meets number of waves per execution unit requirement 1516 /// if explicitly requested value cannot be converted to integer, violates 1517 /// subtarget's specifications, or does not meet number of waves per execution 1518 /// unit requirement. 1519 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 1520 1521 void getPostRAMutations( 1522 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) 1523 const override; 1524 1525 std::unique_ptr<ScheduleDAGMutation> 1526 createFillMFMAShadowMutation(const TargetInstrInfo *TII) const; 1527 1528 bool isWave32() const { 1529 return getWavefrontSize() == 32; 1530 } 1531 1532 bool isWave64() const { 1533 return getWavefrontSize() == 64; 1534 } 1535 1536 const TargetRegisterClass *getBoolRC() const { 1537 return getRegisterInfo()->getBoolRC(); 1538 } 1539 1540 /// \returns Maximum number of work groups per compute unit supported by the 1541 /// subtarget and limited by given \p FlatWorkGroupSize. 1542 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1543 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1544 } 1545 1546 /// \returns Minimum flat work group size supported by the subtarget. 1547 unsigned getMinFlatWorkGroupSize() const override { 1548 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1549 } 1550 1551 /// \returns Maximum flat work group size supported by the subtarget. 
1552 unsigned getMaxFlatWorkGroupSize() const override { 1553 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1554 } 1555 1556 /// \returns Number of waves per execution unit required to support the given 1557 /// \p FlatWorkGroupSize. 1558 unsigned 1559 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { 1560 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); 1561 } 1562 1563 /// \returns Minimum number of waves per execution unit supported by the 1564 /// subtarget. 1565 unsigned getMinWavesPerEU() const override { 1566 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1567 } 1568 1569 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, 1570 SDep &Dep, 1571 const TargetSchedModel *SchedModel) const override; 1572 1573 // \returns true if it's beneficial on this subtarget for the scheduler to 1574 // cluster stores as well as loads. 1575 bool shouldClusterStores() const { return getGeneration() >= GFX11; } 1576 1577 // \returns the number of address arguments from which to enable MIMG NSA 1578 // on supported architectures. 1579 unsigned getNSAThreshold(const MachineFunction &MF) const; 1580 1581 // \returns true if the subtarget has a hazard requiring an "s_nop 0" 1582 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)". 1583 bool requiresNopBeforeDeallocVGPRs() const { 1584 // Currently all targets that support the dealloc VGPRs message also require 1585 // the nop. 
1586 return true; 1587 } 1588 }; 1589 1590 class GCNUserSGPRUsageInfo { 1591 public: 1592 bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; } 1593 1594 bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; } 1595 1596 bool hasDispatchPtr() const { return DispatchPtr; } 1597 1598 bool hasQueuePtr() const { return QueuePtr; } 1599 1600 bool hasKernargSegmentPtr() const { return KernargSegmentPtr; } 1601 1602 bool hasDispatchID() const { return DispatchID; } 1603 1604 bool hasFlatScratchInit() const { return FlatScratchInit; } 1605 1606 bool hasPrivateSegmentSize() const { return PrivateSegmentSize; } 1607 1608 unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; } 1609 1610 unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; } 1611 1612 unsigned getNumFreeUserSGPRs(); 1613 1614 void allocKernargPreloadSGPRs(unsigned NumSGPRs); 1615 1616 enum UserSGPRID : unsigned { 1617 ImplicitBufferPtrID = 0, 1618 PrivateSegmentBufferID = 1, 1619 DispatchPtrID = 2, 1620 QueuePtrID = 3, 1621 KernargSegmentPtrID = 4, 1622 DispatchIdID = 5, 1623 FlatScratchInitID = 6, 1624 PrivateSegmentSizeID = 7 1625 }; 1626 1627 // Returns the size in number of SGPRs for preload user SGPR field. 
1628 static unsigned getNumUserSGPRForField(UserSGPRID ID) { 1629 switch (ID) { 1630 case ImplicitBufferPtrID: 1631 return 2; 1632 case PrivateSegmentBufferID: 1633 return 4; 1634 case DispatchPtrID: 1635 return 2; 1636 case QueuePtrID: 1637 return 2; 1638 case KernargSegmentPtrID: 1639 return 2; 1640 case DispatchIdID: 1641 return 2; 1642 case FlatScratchInitID: 1643 return 2; 1644 case PrivateSegmentSizeID: 1645 return 1; 1646 } 1647 llvm_unreachable("Unknown UserSGPRID."); 1648 } 1649 1650 GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST); 1651 1652 private: 1653 const GCNSubtarget &ST; 1654 1655 // Private memory buffer 1656 // Compute directly in sgpr[0:1] 1657 // Other shaders indirect 64-bits at sgpr[0:1] 1658 bool ImplicitBufferPtr = false; 1659 1660 bool PrivateSegmentBuffer = false; 1661 1662 bool DispatchPtr = false; 1663 1664 bool QueuePtr = false; 1665 1666 bool KernargSegmentPtr = false; 1667 1668 bool DispatchID = false; 1669 1670 bool FlatScratchInit = false; 1671 1672 bool PrivateSegmentSize = false; 1673 1674 unsigned NumKernargPreloadSGPRs = 0; 1675 1676 unsigned NumUsedUserSGPRs = 0; 1677 }; 1678 1679 } // end namespace llvm 1680 1681 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 1682