1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// AMD GCN specific subclass of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 16 17 #include "AMDGPUCallLowering.h" 18 #include "AMDGPURegisterBankInfo.h" 19 #include "AMDGPUSubtarget.h" 20 #include "SIFrameLowering.h" 21 #include "SIISelLowering.h" 22 #include "SIInstrInfo.h" 23 #include "Utils/AMDGPUBaseInfo.h" 24 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 25 26 #define GET_SUBTARGETINFO_HEADER 27 #include "AMDGPUGenSubtargetInfo.inc" 28 29 namespace llvm { 30 31 class GCNTargetMachine; 32 33 class GCNSubtarget final : public AMDGPUGenSubtargetInfo, 34 public AMDGPUSubtarget { 35 public: 36 using AMDGPUSubtarget::getMaxWavesPerEU; 37 38 // Following 2 enums are documented at: 39 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 40 enum class TrapHandlerAbi { 41 NONE = 0x00, 42 AMDHSA = 0x01, 43 }; 44 45 enum class TrapID { 46 LLVMAMDHSATrap = 0x02, 47 LLVMAMDHSADebugTrap = 0x03, 48 }; 49 50 private: 51 /// GlobalISel related APIs. 52 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 53 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo; 54 std::unique_ptr<InstructionSelector> InstSelector; 55 std::unique_ptr<LegalizerInfo> Legalizer; 56 std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo; 57 58 protected: 59 // Basic subtarget description. 
Triple TargetTriple;
// Returned by getTargetID(); also queried for the dynamic XNACK/SRAMECC
// settings (e.g. isXNACKEnabled() calls TargetID.isXnackOnOrAny()).
AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
// Stored as a plain unsigned; getGeneration() converts it to Generation.
unsigned Gen = INVALID;
InstrItineraryData InstrItins;
int LDSBankCount = 0;
unsigned MaxPrivateElementSize = 0;

// Possibly statically set by tablegen, but may want to be overridden.
bool FastDenormalF32 = false;
bool HalfRate64Ops = false;
bool FullRate64Ops = false;

// Dynamically set bits that enable features.
bool FlatForGlobal = false;
bool AutoWaitcntBeforeBarrier = false;
bool BackOffBarrier = false;
bool UnalignedScratchAccess = false;
bool UnalignedAccessMode = false;
bool HasApertureRegs = false;
bool SupportsXNACK = false;

// This should not be used directly. 'TargetID' tracks the dynamic settings
// for XNACK.
bool EnableXNACK = false;

bool EnableTgSplit = false;
bool EnableCuMode = false;
bool TrapHandler = false;

// Used as options.
bool EnableLoadStoreOpt = false;
bool EnableUnsafeDSOffsetFolding = false;
bool EnableSIScheduler = false;
bool EnableDS128 = false;
bool EnablePRTStrictNull = false;
bool DumpCode = false;

// Subtarget properties statically set by tablegen.
bool FP64 = false;
bool FMA = false;
bool MIMG_R128 = false;
bool CIInsts = false;
bool GFX8Insts = false;
bool GFX9Insts = false;
bool GFX90AInsts = false;
bool GFX940Insts = false;
bool GFX10Insts = false;
bool GFX11Insts = false;
bool GFX10_3Insts = false;
bool GFX7GFX8GFX9Insts = false;
bool SGPRInitBug = false;
bool UserSGPRInit16Bug = false;
bool NegativeScratchOffsetBug = false;
bool NegativeUnalignedScratchOffsetBug = false;
bool HasSMemRealTime = false;
bool HasIntClamp = false;
bool HasFmaMixInsts = false;
bool HasMovrel = false;
bool HasVGPRIndexMode = false;
bool HasScalarStores = false;
bool HasScalarAtomics = false;
bool HasSDWAOmod = false;
bool HasSDWAScalar = false;
bool HasSDWASdst = false;
// Each flag below defaults to false and is returned by the accessor of the
// same name with a lower-case first letter (e.g. HasDPP -> hasDPP()).
bool HasSDWAMac = false;
bool HasSDWAOutModsVOPC = false;
bool HasDPP = false;
bool HasDPP8 = false;
bool Has64BitDPP = false;
bool HasPackedFP32Ops = false;
bool HasImageInsts = false;
bool HasExtendedImageInsts = false;
bool HasR128A16 = false;
bool HasA16 = false;
bool HasG16 = false;
bool HasNSAEncoding = false;
bool HasPartialNSAEncoding = false;
// GFX10_BEncoding additionally backs hasGlobalAddTidInsts() and
// hasAtomicCSub().
bool GFX10_AEncoding = false;
bool GFX10_BEncoding = false;
bool HasDLInsts = false;
bool HasFmacF64Inst = false;
// HasDot<N>Insts flags; each one is returned by the matching
// hasDot<N>Insts() accessor.
bool HasDot1Insts = false;
bool HasDot2Insts = false;
bool HasDot3Insts = false;
bool HasDot4Insts = false;
bool HasDot5Insts = false;
bool HasDot6Insts = false;
bool HasDot7Insts = false;
bool HasDot8Insts = false;
bool HasDot9Insts = false;
bool HasDot10Insts = false;
bool HasMAIInsts = false;
bool HasFP8Insts = false;
bool HasPkFmacF16Inst = false;
bool HasAtomicDsPkAdd16Insts = false;
bool HasAtomicFlatPkAdd16Insts = false;
// hasAtomicFaddInsts() reports true when either of the next two is set.
bool HasAtomicFaddRtnInsts = false;
bool HasAtomicFaddNoRtnInsts = false;
bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
bool HasAtomicBufferGlobalPkAddF16Insts = false;
bool HasAtomicGlobalPkAddBF16Inst = false;
bool HasFlatAtomicFaddF32Inst = false;
bool SupportsSRAMECC = false;

// This should not be used directly. 'TargetID' tracks the dynamic settings
// for SRAMECC.
bool EnableSRAMECC = false;

bool HasNoSdstCMPX = false;
bool HasVscnt = false;
bool HasGetWaveIdInst = false;
bool HasSMemTimeInst = false;
bool HasShaderCyclesRegister = false;
bool HasVOP3Literal = false;
bool HasNoDataDepHazard = false;
bool FlatAddressSpace = false;
bool FlatInstOffsets = false;
bool FlatGlobalInsts = false;
bool FlatScratchInsts = false;
bool ScalarFlatScratchInsts = false;
// enableFlatScratch() is true when HasArchitectedFlatScratch is set, or when
// EnableFlatScratch is set and flat scratch instructions are available.
bool HasArchitectedFlatScratch = false;
bool EnableFlatScratch = false;
bool HasArchitectedSGPRs = false;
bool AddNoCarryInsts = false;
bool HasUnpackedD16VMem = false;
// hasLDSMisalignedBug() only reports this when EnableCuMode is off.
bool LDSMisalignedBug = false;
bool HasMFMAInlineLiteralBug = false;
// The *Enabled() accessors for the next two also require
// UnalignedAccessMode.
bool UnalignedBufferAccess = false;
bool UnalignedDSAccess = false;
bool HasPackedTID = false;
// Written by setScalarizeGlobalBehavior() and read by
// getScalarizeGlobalBehavior().
bool ScalarizeGlobal = false;

// Hazard / hardware-bug flags; each is returned unchanged by the accessor of
// the same name with a lower-case first letter.
bool HasVcmpxPermlaneHazard = false;
bool HasVMEMtoScalarWriteHazard = false;
bool HasSMEMtoVectorWriteHazard = false;
bool HasInstFwdPrefetchBug = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
bool HasNSAClauseBug = false;
bool HasOffset3fBug = false;
bool HasFlatSegmentOffsetBug = false;
bool HasImageStoreD16Bug = false;
bool HasImageGather4D16Bug = false;
bool HasGFX11FullVGPRs = false;
bool HasMADIntraFwdBug = false;
bool HasVOPDInsts = false;
bool HasVALUTransUseHazard = false;
bool HasForceStoreSC0SC1 = false;

// Dummy feature to use for assembler in tablegen.
211 bool FeatureDisable = false; 212 213 SelectionDAGTargetInfo TSInfo; 214 private: 215 SIInstrInfo InstrInfo; 216 SITargetLowering TLInfo; 217 SIFrameLowering FrameLowering; 218 219 public: 220 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 221 const GCNTargetMachine &TM); 222 ~GCNSubtarget() override; 223 224 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, 225 StringRef GPU, StringRef FS); 226 227 const SIInstrInfo *getInstrInfo() const override { 228 return &InstrInfo; 229 } 230 231 const SIFrameLowering *getFrameLowering() const override { 232 return &FrameLowering; 233 } 234 235 const SITargetLowering *getTargetLowering() const override { 236 return &TLInfo; 237 } 238 239 const SIRegisterInfo *getRegisterInfo() const override { 240 return &InstrInfo.getRegisterInfo(); 241 } 242 243 const CallLowering *getCallLowering() const override { 244 return CallLoweringInfo.get(); 245 } 246 247 const InlineAsmLowering *getInlineAsmLowering() const override { 248 return InlineAsmLoweringInfo.get(); 249 } 250 251 InstructionSelector *getInstructionSelector() const override { 252 return InstSelector.get(); 253 } 254 255 const LegalizerInfo *getLegalizerInfo() const override { 256 return Legalizer.get(); 257 } 258 259 const AMDGPURegisterBankInfo *getRegBankInfo() const override { 260 return RegBankInfo.get(); 261 } 262 263 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const { 264 return TargetID; 265 } 266 267 // Nothing implemented, just prevent crashes on use. 268 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 269 return &TSInfo; 270 } 271 272 const InstrItineraryData *getInstrItineraryData() const override { 273 return &InstrItins; 274 } 275 276 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); 277 278 Generation getGeneration() const { 279 return (Generation)Gen; 280 } 281 282 unsigned getMaxWaveScratchSize() const { 283 // See COMPUTE_TMPRING_SIZE.WAVESIZE. 
284 if (getGeneration() < GFX11) { 285 // 13-bit field in units of 256-dword. 286 return (256 * 4) * ((1 << 13) - 1); 287 } 288 // 15-bit field in units of 64-dword. 289 return (64 * 4) * ((1 << 15) - 1); 290 } 291 292 /// Return the number of high bits known to be zero for a frame index. 293 unsigned getKnownHighZeroBitsForFrameIndex() const { 294 return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2(); 295 } 296 297 int getLDSBankCount() const { 298 return LDSBankCount; 299 } 300 301 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { 302 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; 303 } 304 305 unsigned getConstantBusLimit(unsigned Opcode) const; 306 307 /// Returns if the result of this instruction with a 16-bit result returned in 308 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve 309 /// the original value. 310 bool zeroesHigh16BitsOfDest(unsigned Opcode) const; 311 312 bool supportsWGP() const { return getGeneration() >= GFX10; } 313 314 bool hasIntClamp() const { 315 return HasIntClamp; 316 } 317 318 bool hasFP64() const { 319 return FP64; 320 } 321 322 bool hasMIMG_R128() const { 323 return MIMG_R128; 324 } 325 326 bool hasHWFP64() const { 327 return FP64; 328 } 329 330 bool hasHalfRate64Ops() const { 331 return HalfRate64Ops; 332 } 333 334 bool hasFullRate64Ops() const { 335 return FullRate64Ops; 336 } 337 338 bool hasAddr64() const { 339 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 340 } 341 342 bool hasFlat() const { 343 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS); 344 } 345 346 // Return true if the target only has the reverse operand versions of VALU 347 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). 
348 bool hasOnlyRevVALUShifts() const { 349 return getGeneration() >= VOLCANIC_ISLANDS; 350 } 351 352 bool hasFractBug() const { 353 return getGeneration() == SOUTHERN_ISLANDS; 354 } 355 356 bool hasBFE() const { 357 return true; 358 } 359 360 bool hasBFI() const { 361 return true; 362 } 363 364 bool hasBFM() const { 365 return hasBFE(); 366 } 367 368 bool hasBCNT(unsigned Size) const { 369 return true; 370 } 371 372 bool hasFFBL() const { 373 return true; 374 } 375 376 bool hasFFBH() const { 377 return true; 378 } 379 380 bool hasMed3_16() const { 381 return getGeneration() >= AMDGPUSubtarget::GFX9; 382 } 383 384 bool hasMin3Max3_16() const { 385 return getGeneration() >= AMDGPUSubtarget::GFX9; 386 } 387 388 bool hasFmaMixInsts() const { 389 return HasFmaMixInsts; 390 } 391 392 bool hasCARRY() const { 393 return true; 394 } 395 396 bool hasFMA() const { 397 return FMA; 398 } 399 400 bool hasSwap() const { 401 return GFX9Insts; 402 } 403 404 bool hasScalarPackInsts() const { 405 return GFX9Insts; 406 } 407 408 bool hasScalarMulHiInsts() const { 409 return GFX9Insts; 410 } 411 412 TrapHandlerAbi getTrapHandlerAbi() const { 413 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; 414 } 415 416 bool supportsGetDoorbellID() const { 417 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. 418 return getGeneration() >= GFX9; 419 } 420 421 /// True if the offset field of DS instructions works as expected. On SI, the 422 /// offset uses a 16-bit adder and does not always wrap properly. 423 bool hasUsableDSOffset() const { 424 return getGeneration() >= SEA_ISLANDS; 425 } 426 427 bool unsafeDSOffsetFoldingEnabled() const { 428 return EnableUnsafeDSOffsetFolding; 429 } 430 431 /// Condition output from div_scale is usable. 432 bool hasUsableDivScaleConditionOutput() const { 433 return getGeneration() != SOUTHERN_ISLANDS; 434 } 435 436 /// Extra wait hazard is needed in some cases before 437 /// s_cbranch_vccnz/s_cbranch_vccz. 
438 bool hasReadVCCZBug() const { 439 return getGeneration() <= SEA_ISLANDS; 440 } 441 442 /// Writes to VCC_LO/VCC_HI update the VCCZ flag. 443 bool partialVCCWritesUpdateVCCZ() const { 444 return getGeneration() >= GFX10; 445 } 446 447 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 448 /// was written by a VALU instruction. 449 bool hasSMRDReadVALUDefHazard() const { 450 return getGeneration() == SOUTHERN_ISLANDS; 451 } 452 453 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 454 /// SGPR was written by a VALU Instruction. 455 bool hasVMEMReadSGPRVALUDefHazard() const { 456 return getGeneration() >= VOLCANIC_ISLANDS; 457 } 458 459 bool hasRFEHazards() const { 460 return getGeneration() >= VOLCANIC_ISLANDS; 461 } 462 463 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. 464 unsigned getSetRegWaitStates() const { 465 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 466 } 467 468 bool dumpCode() const { 469 return DumpCode; 470 } 471 472 /// Return the amount of LDS that can be used that will not restrict the 473 /// occupancy lower than WaveCount. 474 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 475 const Function &) const; 476 477 bool supportsMinMaxDenormModes() const { 478 return getGeneration() >= AMDGPUSubtarget::GFX9; 479 } 480 481 /// \returns If target supports S_DENORM_MODE. 482 bool hasDenormModeInst() const { 483 return getGeneration() >= AMDGPUSubtarget::GFX10; 484 } 485 486 bool useFlatForGlobal() const { 487 return FlatForGlobal; 488 } 489 490 /// \returns If target supports ds_read/write_b128 and user enables generation 491 /// of ds_read/write_b128. 492 bool useDS128() const { 493 return CIInsts && EnableDS128; 494 } 495 496 /// \return If target supports ds_read/write_b96/128. 
497 bool hasDS96AndDS128() const { 498 return CIInsts; 499 } 500 501 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 502 bool haveRoundOpsF64() const { 503 return CIInsts; 504 } 505 506 /// \returns If MUBUF instructions always perform range checking, even for 507 /// buffer resources used for private memory access. 508 bool privateMemoryResourceIsRangeChecked() const { 509 return getGeneration() < AMDGPUSubtarget::GFX9; 510 } 511 512 /// \returns If target requires PRT Struct NULL support (zero result registers 513 /// for sparse texture support). 514 bool usePRTStrictNull() const { 515 return EnablePRTStrictNull; 516 } 517 518 bool hasAutoWaitcntBeforeBarrier() const { 519 return AutoWaitcntBeforeBarrier; 520 } 521 522 /// \returns true if the target supports backing off of s_barrier instructions 523 /// when an exception is raised. 524 bool supportsBackOffBarrier() const { 525 return BackOffBarrier; 526 } 527 528 bool hasUnalignedBufferAccess() const { 529 return UnalignedBufferAccess; 530 } 531 532 bool hasUnalignedBufferAccessEnabled() const { 533 return UnalignedBufferAccess && UnalignedAccessMode; 534 } 535 536 bool hasUnalignedDSAccess() const { 537 return UnalignedDSAccess; 538 } 539 540 bool hasUnalignedDSAccessEnabled() const { 541 return UnalignedDSAccess && UnalignedAccessMode; 542 } 543 544 bool hasUnalignedScratchAccess() const { 545 return UnalignedScratchAccess; 546 } 547 548 bool hasUnalignedAccessMode() const { 549 return UnalignedAccessMode; 550 } 551 552 bool hasApertureRegs() const { 553 return HasApertureRegs; 554 } 555 556 bool isTrapHandlerEnabled() const { 557 return TrapHandler; 558 } 559 560 bool isXNACKEnabled() const { 561 return TargetID.isXnackOnOrAny(); 562 } 563 564 bool isTgSplitEnabled() const { 565 return EnableTgSplit; 566 } 567 568 bool isCuModeEnabled() const { 569 return EnableCuMode; 570 } 571 572 bool hasFlatAddressSpace() const { 573 return FlatAddressSpace; 574 } 575 576 bool hasFlatScrRegister() const { 577 return 
hasFlatAddressSpace(); 578 } 579 580 bool hasFlatInstOffsets() const { 581 return FlatInstOffsets; 582 } 583 584 bool hasFlatGlobalInsts() const { 585 return FlatGlobalInsts; 586 } 587 588 bool hasFlatScratchInsts() const { 589 return FlatScratchInsts; 590 } 591 592 // Check if target supports ST addressing mode with FLAT scratch instructions. 593 // The ST addressing mode means no registers are used, either VGPR or SGPR, 594 // but only immediate offset is swizzled and added to the FLAT scratch base. 595 bool hasFlatScratchSTMode() const { 596 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts()); 597 } 598 599 bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; } 600 601 bool hasScalarFlatScratchInsts() const { 602 return ScalarFlatScratchInsts; 603 } 604 605 bool enableFlatScratch() const { 606 return flatScratchIsArchitected() || 607 (EnableFlatScratch && hasFlatScratchInsts()); 608 } 609 610 bool hasGlobalAddTidInsts() const { 611 return GFX10_BEncoding; 612 } 613 614 bool hasAtomicCSub() const { 615 return GFX10_BEncoding; 616 } 617 618 bool hasMultiDwordFlatScratchAddressing() const { 619 return getGeneration() >= GFX9; 620 } 621 622 bool hasFlatSegmentOffsetBug() const { 623 return HasFlatSegmentOffsetBug; 624 } 625 626 bool hasFlatLgkmVMemCountInOrder() const { 627 return getGeneration() > GFX9; 628 } 629 630 bool hasD16LoadStore() const { 631 return getGeneration() >= GFX9; 632 } 633 634 bool d16PreservesUnusedBits() const { 635 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny(); 636 } 637 638 bool hasD16Images() const { 639 return getGeneration() >= VOLCANIC_ISLANDS; 640 } 641 642 /// Return if most LDS instructions have an m0 use that require m0 to be 643 /// initialized. 644 bool ldsRequiresM0Init() const { 645 return getGeneration() < GFX9; 646 } 647 648 // True if the hardware rewinds and replays GWS operations if a wave is 649 // preempted. 
650 // 651 // If this is false, a GWS operation requires testing if a nack set the 652 // MEM_VIOL bit, and repeating if so. 653 bool hasGWSAutoReplay() const { 654 return getGeneration() >= GFX9; 655 } 656 657 /// \returns if target has ds_gws_sema_release_all instruction. 658 bool hasGWSSemaReleaseAll() const { 659 return CIInsts; 660 } 661 662 /// \returns true if the target has integer add/sub instructions that do not 663 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, 664 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier 665 /// for saturation. 666 bool hasAddNoCarry() const { 667 return AddNoCarryInsts; 668 } 669 670 bool hasUnpackedD16VMem() const { 671 return HasUnpackedD16VMem; 672 } 673 674 // Covers VS/PS/CS graphics shaders 675 bool isMesaGfxShader(const Function &F) const { 676 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 677 } 678 679 bool hasMad64_32() const { 680 return getGeneration() >= SEA_ISLANDS; 681 } 682 683 bool hasSDWAOmod() const { 684 return HasSDWAOmod; 685 } 686 687 bool hasSDWAScalar() const { 688 return HasSDWAScalar; 689 } 690 691 bool hasSDWASdst() const { 692 return HasSDWASdst; 693 } 694 695 bool hasSDWAMac() const { 696 return HasSDWAMac; 697 } 698 699 bool hasSDWAOutModsVOPC() const { 700 return HasSDWAOutModsVOPC; 701 } 702 703 bool hasDLInsts() const { 704 return HasDLInsts; 705 } 706 707 bool hasFmacF64Inst() const { return HasFmacF64Inst; } 708 709 bool hasDot1Insts() const { 710 return HasDot1Insts; 711 } 712 713 bool hasDot2Insts() const { 714 return HasDot2Insts; 715 } 716 717 bool hasDot3Insts() const { 718 return HasDot3Insts; 719 } 720 721 bool hasDot4Insts() const { 722 return HasDot4Insts; 723 } 724 725 bool hasDot5Insts() const { 726 return HasDot5Insts; 727 } 728 729 bool hasDot6Insts() const { 730 return HasDot6Insts; 731 } 732 733 bool hasDot7Insts() const { 734 return HasDot7Insts; 735 } 736 737 bool hasDot8Insts() const { 738 return HasDot8Insts; 739 
} 740 741 bool hasDot9Insts() const { 742 return HasDot9Insts; 743 } 744 745 bool hasDot10Insts() const { 746 return HasDot10Insts; 747 } 748 749 bool hasMAIInsts() const { 750 return HasMAIInsts; 751 } 752 753 bool hasFP8Insts() const { 754 return HasFP8Insts; 755 } 756 757 bool hasPkFmacF16Inst() const { 758 return HasPkFmacF16Inst; 759 } 760 761 bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; } 762 763 bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; } 764 765 bool hasAtomicFaddInsts() const { 766 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts; 767 } 768 769 bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; } 770 771 bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } 772 773 bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const { 774 return HasAtomicBufferGlobalPkAddF16NoRtnInsts; 775 } 776 777 bool hasAtomicBufferGlobalPkAddF16Insts() const { 778 return HasAtomicBufferGlobalPkAddF16Insts; 779 } 780 781 bool hasAtomicGlobalPkAddBF16Inst() const { 782 return HasAtomicGlobalPkAddBF16Inst; 783 } 784 785 bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } 786 787 bool hasNoSdstCMPX() const { 788 return HasNoSdstCMPX; 789 } 790 791 bool hasVscnt() const { 792 return HasVscnt; 793 } 794 795 bool hasGetWaveIdInst() const { 796 return HasGetWaveIdInst; 797 } 798 799 bool hasSMemTimeInst() const { 800 return HasSMemTimeInst; 801 } 802 803 bool hasShaderCyclesRegister() const { 804 return HasShaderCyclesRegister; 805 } 806 807 bool hasVOP3Literal() const { 808 return HasVOP3Literal; 809 } 810 811 bool hasNoDataDepHazard() const { 812 return HasNoDataDepHazard; 813 } 814 815 bool vmemWriteNeedsExpWaitcnt() const { 816 return getGeneration() < SEA_ISLANDS; 817 } 818 819 bool hasInstPrefetch() const { return getGeneration() >= GFX10; } 820 821 // Scratch is allocated in 256 dword per wave blocks for the entire 822 // wavefront. 
When viewed from the perspective of an arbitrary workitem, this 823 // is 4-byte aligned. 824 // 825 // Only 4-byte alignment is really needed to access anything. Transformations 826 // on the pointer value itself may rely on the alignment / known low bits of 827 // the pointer. Set this to something above the minimum to avoid needing 828 // dynamic realignment in common cases. 829 Align getStackAlignment() const { return Align(16); } 830 831 bool enableMachineScheduler() const override { 832 return true; 833 } 834 835 bool useAA() const override; 836 837 bool enableSubRegLiveness() const override { 838 return true; 839 } 840 841 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } 842 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 843 844 // static wrappers 845 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 846 847 // XXX - Why is this here if it isn't in the default pass set? 848 bool enableEarlyIfConversion() const override { 849 return true; 850 } 851 852 void overrideSchedPolicy(MachineSchedPolicy &Policy, 853 unsigned NumRegionInstrs) const override; 854 855 unsigned getMaxNumUserSGPRs() const { 856 return 16; 857 } 858 859 bool hasSMemRealTime() const { 860 return HasSMemRealTime; 861 } 862 863 bool hasMovrel() const { 864 return HasMovrel; 865 } 866 867 bool hasVGPRIndexMode() const { 868 return HasVGPRIndexMode; 869 } 870 871 bool useVGPRIndexMode() const; 872 873 bool hasScalarCompareEq64() const { 874 return getGeneration() >= VOLCANIC_ISLANDS; 875 } 876 877 bool hasScalarStores() const { 878 return HasScalarStores; 879 } 880 881 bool hasScalarAtomics() const { 882 return HasScalarAtomics; 883 } 884 885 bool hasLDSFPAtomicAdd() const { return GFX8Insts; } 886 887 /// \returns true if the subtarget has the v_permlanex16_b32 instruction. 888 bool hasPermLaneX16() const { return getGeneration() >= GFX10; } 889 890 /// \returns true if the subtarget has the v_permlane64_b32 instruction. 
891 bool hasPermLane64() const { return getGeneration() >= GFX11; } 892 893 bool hasDPP() const { 894 return HasDPP; 895 } 896 897 bool hasDPPBroadcasts() const { 898 return HasDPP && getGeneration() < GFX10; 899 } 900 901 bool hasDPPWavefrontShifts() const { 902 return HasDPP && getGeneration() < GFX10; 903 } 904 905 bool hasDPP8() const { 906 return HasDPP8; 907 } 908 909 bool has64BitDPP() const { 910 return Has64BitDPP; 911 } 912 913 bool hasPackedFP32Ops() const { 914 return HasPackedFP32Ops; 915 } 916 917 bool hasFmaakFmamkF32Insts() const { 918 return getGeneration() >= GFX10 || hasGFX940Insts(); 919 } 920 921 bool hasImageInsts() const { 922 return HasImageInsts; 923 } 924 925 bool hasExtendedImageInsts() const { 926 return HasExtendedImageInsts; 927 } 928 929 bool hasR128A16() const { 930 return HasR128A16; 931 } 932 933 bool hasA16() const { return HasA16; } 934 935 bool hasG16() const { return HasG16; } 936 937 bool hasOffset3fBug() const { 938 return HasOffset3fBug; 939 } 940 941 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } 942 943 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } 944 945 bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; } 946 947 bool hasNSAEncoding() const { return HasNSAEncoding; } 948 949 bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; } 950 951 unsigned getNSAMaxSize() const { return AMDGPU::getNSAMaxSize(*this); } 952 953 bool hasGFX10_AEncoding() const { 954 return GFX10_AEncoding; 955 } 956 957 bool hasGFX10_BEncoding() const { 958 return GFX10_BEncoding; 959 } 960 961 bool hasGFX10_3Insts() const { 962 return GFX10_3Insts; 963 } 964 965 bool hasMadF16() const; 966 967 bool hasMovB64() const { return GFX940Insts; } 968 969 bool hasLshlAddB64() const { return GFX940Insts; } 970 971 bool enableSIScheduler() const { 972 return EnableSIScheduler; 973 } 974 975 bool loadStoreOptEnabled() const { 976 return EnableLoadStoreOpt; 977 } 978 979 bool hasSGPRInitBug() const 
{ 980 return SGPRInitBug; 981 } 982 983 bool hasUserSGPRInit16Bug() const { 984 return UserSGPRInit16Bug && isWave32(); 985 } 986 987 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } 988 989 bool hasNegativeUnalignedScratchOffsetBug() const { 990 return NegativeUnalignedScratchOffsetBug; 991 } 992 993 bool hasMFMAInlineLiteralBug() const { 994 return HasMFMAInlineLiteralBug; 995 } 996 997 bool has12DWordStoreHazard() const { 998 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 999 } 1000 1001 // \returns true if the subtarget supports DWORDX3 load/store instructions. 1002 bool hasDwordx3LoadStores() const { 1003 return CIInsts; 1004 } 1005 1006 bool hasReadM0MovRelInterpHazard() const { 1007 return getGeneration() == AMDGPUSubtarget::GFX9; 1008 } 1009 1010 bool hasReadM0SendMsgHazard() const { 1011 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 1012 getGeneration() <= AMDGPUSubtarget::GFX9; 1013 } 1014 1015 bool hasReadM0LdsDmaHazard() const { 1016 return getGeneration() == AMDGPUSubtarget::GFX9; 1017 } 1018 1019 bool hasReadM0LdsDirectHazard() const { 1020 return getGeneration() == AMDGPUSubtarget::GFX9; 1021 } 1022 1023 bool hasVcmpxPermlaneHazard() const { 1024 return HasVcmpxPermlaneHazard; 1025 } 1026 1027 bool hasVMEMtoScalarWriteHazard() const { 1028 return HasVMEMtoScalarWriteHazard; 1029 } 1030 1031 bool hasSMEMtoVectorWriteHazard() const { 1032 return HasSMEMtoVectorWriteHazard; 1033 } 1034 1035 bool hasLDSMisalignedBug() const { 1036 return LDSMisalignedBug && !EnableCuMode; 1037 } 1038 1039 bool hasInstFwdPrefetchBug() const { 1040 return HasInstFwdPrefetchBug; 1041 } 1042 1043 bool hasVcmpxExecWARHazard() const { 1044 return HasVcmpxExecWARHazard; 1045 } 1046 1047 bool hasLdsBranchVmemWARHazard() const { 1048 return HasLdsBranchVmemWARHazard; 1049 } 1050 1051 // Shift amount of a 64 bit shift cannot be a highest allocated register 1052 // if also at the end of the allocation block. 
1053 bool hasShift64HighRegBug() const { 1054 return GFX90AInsts && !GFX940Insts; 1055 } 1056 1057 // Has one cycle hazard on transcendental instruction feeding a 1058 // non transcendental VALU. 1059 bool hasTransForwardingHazard() const { return GFX940Insts; } 1060 1061 // Has one cycle hazard on a VALU instruction partially writing dst with 1062 // a shift of result bits feeding another VALU instruction. 1063 bool hasDstSelForwardingHazard() const { return GFX940Insts; } 1064 1065 // Cannot use op_sel with v_dot instructions. 1066 bool hasDOTOpSelHazard() const { return GFX940Insts; } 1067 1068 // Does not have HW interlocs for VALU writing and then reading SGPRs. 1069 bool hasVDecCoExecHazard() const { 1070 return GFX940Insts; 1071 } 1072 1073 bool hasNSAtoVMEMBug() const { 1074 return HasNSAtoVMEMBug; 1075 } 1076 1077 bool hasNSAClauseBug() const { return HasNSAClauseBug; } 1078 1079 bool hasHardClauses() const { return getGeneration() >= GFX10; } 1080 1081 bool hasGFX90AInsts() const { return GFX90AInsts; } 1082 1083 bool hasFPAtomicToDenormModeHazard() const { 1084 return getGeneration() == GFX10; 1085 } 1086 1087 bool hasVOP3DPP() const { return getGeneration() >= GFX11; } 1088 1089 bool hasLdsDirect() const { return getGeneration() >= GFX11; } 1090 1091 bool hasVALUPartialForwardingHazard() const { 1092 return getGeneration() >= GFX11; 1093 } 1094 1095 bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; } 1096 1097 bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; } 1098 1099 bool hasVALUMaskWriteHazard() const { return getGeneration() >= GFX11; } 1100 1101 /// Return if operations acting on VGPR tuples require even alignment. 1102 bool needsAlignedVGPRs() const { return GFX90AInsts; } 1103 1104 /// Return true if the target has the S_PACK_HL_B32_B16 instruction. 
1105 bool hasSPackHL() const { return GFX11Insts; } 1106 1107 /// Return true if the target's EXP instruction has the COMPR flag, which 1108 /// affects the meaning of the EN (enable) bits. 1109 bool hasCompressedExport() const { return !GFX11Insts; } 1110 1111 /// Return true if the target's EXP instruction supports the NULL export 1112 /// target. 1113 bool hasNullExportTarget() const { return !GFX11Insts; } 1114 1115 bool hasGFX11FullVGPRs() const { return HasGFX11FullVGPRs; } 1116 1117 bool hasVOPDInsts() const { return HasVOPDInsts; } 1118 1119 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } 1120 1121 /// Return true if the target has the S_DELAY_ALU instruction. 1122 bool hasDelayAlu() const { return GFX11Insts; } 1123 1124 bool hasPackedTID() const { return HasPackedTID; } 1125 1126 // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that 1127 // hasGFX90AInsts is also true. 1128 bool hasGFX940Insts() const { return GFX940Insts; } 1129 1130 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 1131 /// SGPRs 1132 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 1133 1134 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 1135 /// VGPRs 1136 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 1137 1138 /// Return occupancy for the given function. Used LDS and a number of 1139 /// registers if provided. 1140 /// Note, occupancy can be affected by the scratch allocation as well, but 1141 /// we do not have enough information to compute it. 1142 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, 1143 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; 1144 1145 /// \returns true if the flat_scratch register should be initialized with the 1146 /// pointer to the wave's scratch memory rather than a size and offset. 
1147 bool flatScratchIsPointer() const { 1148 return getGeneration() >= AMDGPUSubtarget::GFX9; 1149 } 1150 1151 /// \returns true if the flat_scratch register is initialized by the HW. 1152 /// In this case it is readonly. 1153 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } 1154 1155 /// \returns true if the architected SGPRs are enabled. 1156 bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; } 1157 1158 /// \returns true if the machine has merged shaders in which s0-s7 are 1159 /// reserved by the hardware and user SGPRs start at s8 1160 bool hasMergedShaders() const { 1161 return getGeneration() >= GFX9; 1162 } 1163 1164 // \returns true if the target supports the pre-NGG legacy geometry path. 1165 bool hasLegacyGeometry() const { return getGeneration() < GFX11; } 1166 1167 /// \returns SGPR allocation granularity supported by the subtarget. 1168 unsigned getSGPRAllocGranule() const { 1169 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 1170 } 1171 1172 /// \returns SGPR encoding granularity supported by the subtarget. 1173 unsigned getSGPREncodingGranule() const { 1174 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 1175 } 1176 1177 /// \returns Total number of SGPRs supported by the subtarget. 1178 unsigned getTotalNumSGPRs() const { 1179 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 1180 } 1181 1182 /// \returns Addressable number of SGPRs supported by the subtarget. 1183 unsigned getAddressableNumSGPRs() const { 1184 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); 1185 } 1186 1187 /// \returns Minimum number of SGPRs that meets the given number of waves per 1188 /// execution unit requirement supported by the subtarget. 1189 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 1190 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); 1191 } 1192 1193 /// \returns Maximum number of SGPRs that meets the given number of waves per 1194 /// execution unit requirement supported by the subtarget. 
1195 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 1196 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); 1197 } 1198 1199 /// \returns Reserved number of SGPRs. This is common 1200 /// utility function called by MachineFunction and 1201 /// Function variants of getReservedNumSGPRs. 1202 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const; 1203 /// \returns Reserved number of SGPRs for given machine function \p MF. 1204 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 1205 1206 /// \returns Reserved number of SGPRs for given function \p F. 1207 unsigned getReservedNumSGPRs(const Function &F) const; 1208 1209 /// \returns max num SGPRs. This is the common utility 1210 /// function called by MachineFunction and Function 1211 /// variants of getMaxNumSGPRs. 1212 unsigned getBaseMaxNumSGPRs(const Function &F, 1213 std::pair<unsigned, unsigned> WavesPerEU, 1214 unsigned PreloadedSGPRs, 1215 unsigned ReservedNumSGPRs) const; 1216 1217 /// \returns Maximum number of SGPRs that meets number of waves per execution 1218 /// unit requirement for function \p MF, or number of SGPRs explicitly 1219 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 1220 /// 1221 /// \returns Value that meets number of waves per execution unit requirement 1222 /// if explicitly requested value cannot be converted to integer, violates 1223 /// subtarget's specifications, or does not meet number of waves per execution 1224 /// unit requirement. 1225 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 1226 1227 /// \returns Maximum number of SGPRs that meets number of waves per execution 1228 /// unit requirement for function \p F, or number of SGPRs explicitly 1229 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F. 
1230 /// 1231 /// \returns Value that meets number of waves per execution unit requirement 1232 /// if explicitly requested value cannot be converted to integer, violates 1233 /// subtarget's specifications, or does not meet number of waves per execution 1234 /// unit requirement. 1235 unsigned getMaxNumSGPRs(const Function &F) const; 1236 1237 /// \returns VGPR allocation granularity supported by the subtarget. 1238 unsigned getVGPRAllocGranule() const { 1239 return AMDGPU::IsaInfo::getVGPRAllocGranule(this); 1240 } 1241 1242 /// \returns VGPR encoding granularity supported by the subtarget. 1243 unsigned getVGPREncodingGranule() const { 1244 return AMDGPU::IsaInfo::getVGPREncodingGranule(this); 1245 } 1246 1247 /// \returns Total number of VGPRs supported by the subtarget. 1248 unsigned getTotalNumVGPRs() const { 1249 return AMDGPU::IsaInfo::getTotalNumVGPRs(this); 1250 } 1251 1252 /// \returns Addressable number of VGPRs supported by the subtarget. 1253 unsigned getAddressableNumVGPRs() const { 1254 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); 1255 } 1256 1257 /// \returns the minimum number of VGPRs that will prevent achieving more than 1258 /// the specified number of waves \p WavesPerEU. 1259 unsigned getMinNumVGPRs(unsigned WavesPerEU) const { 1260 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); 1261 } 1262 1263 /// \returns the maximum number of VGPRs that can be used and still achieved 1264 /// at least the specified number of waves \p WavesPerEU. 1265 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { 1266 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); 1267 } 1268 1269 /// \returns max num VGPRs. This is the common utility function 1270 /// called by MachineFunction and Function variants of getMaxNumVGPRs. 
1271 unsigned getBaseMaxNumVGPRs(const Function &F, 1272 std::pair<unsigned, unsigned> WavesPerEU) const; 1273 /// \returns Maximum number of VGPRs that meets number of waves per execution 1274 /// unit requirement for function \p F, or number of VGPRs explicitly 1275 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F. 1276 /// 1277 /// \returns Value that meets number of waves per execution unit requirement 1278 /// if explicitly requested value cannot be converted to integer, violates 1279 /// subtarget's specifications, or does not meet number of waves per execution 1280 /// unit requirement. 1281 unsigned getMaxNumVGPRs(const Function &F) const; 1282 1283 unsigned getMaxNumAGPRs(const Function &F) const { 1284 return getMaxNumVGPRs(F); 1285 } 1286 1287 /// \returns Maximum number of VGPRs that meets number of waves per execution 1288 /// unit requirement for function \p MF, or number of VGPRs explicitly 1289 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 1290 /// 1291 /// \returns Value that meets number of waves per execution unit requirement 1292 /// if explicitly requested value cannot be converted to integer, violates 1293 /// subtarget's specifications, or does not meet number of waves per execution 1294 /// unit requirement. 
1295 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 1296 1297 void getPostRAMutations( 1298 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) 1299 const override; 1300 1301 std::unique_ptr<ScheduleDAGMutation> 1302 createFillMFMAShadowMutation(const TargetInstrInfo *TII) const; 1303 1304 bool isWave32() const { 1305 return getWavefrontSize() == 32; 1306 } 1307 1308 bool isWave64() const { 1309 return getWavefrontSize() == 64; 1310 } 1311 1312 const TargetRegisterClass *getBoolRC() const { 1313 return getRegisterInfo()->getBoolRC(); 1314 } 1315 1316 /// \returns Maximum number of work groups per compute unit supported by the 1317 /// subtarget and limited by given \p FlatWorkGroupSize. 1318 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1319 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1320 } 1321 1322 /// \returns Minimum flat work group size supported by the subtarget. 1323 unsigned getMinFlatWorkGroupSize() const override { 1324 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1325 } 1326 1327 /// \returns Maximum flat work group size supported by the subtarget. 1328 unsigned getMaxFlatWorkGroupSize() const override { 1329 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1330 } 1331 1332 /// \returns Number of waves per execution unit required to support the given 1333 /// \p FlatWorkGroupSize. 1334 unsigned 1335 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { 1336 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); 1337 } 1338 1339 /// \returns Minimum number of waves per execution unit supported by the 1340 /// subtarget. 
1341 unsigned getMinWavesPerEU() const override { 1342 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1343 } 1344 1345 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, 1346 SDep &Dep) const override; 1347 1348 // \returns true if it's beneficial on this subtarget for the scheduler to 1349 // cluster stores as well as loads. 1350 bool shouldClusterStores() const { return getGeneration() >= GFX11; } 1351 1352 // \returns the number of address arguments from which to enable MIMG NSA 1353 // on supported architectures. 1354 unsigned getNSAThreshold(const MachineFunction &MF) const; 1355 1356 // \returns true if the subtarget has a hazard requiring an "s_nop 0" 1357 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)". 1358 bool requiresNopBeforeDeallocVGPRs() const { 1359 // Currently all targets that support the dealloc VGPRs message also require 1360 // the nop. 1361 return true; 1362 } 1363 }; 1364 1365 } // end namespace llvm 1366 1367 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 1368