1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// AMD GCN specific subclass of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 16 17 #include "AMDGPUCallLowering.h" 18 #include "AMDGPUSubtarget.h" 19 #include "SIFrameLowering.h" 20 #include "SIISelLowering.h" 21 #include "SIInstrInfo.h" 22 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 23 24 #define GET_SUBTARGETINFO_HEADER 25 #include "AMDGPUGenSubtargetInfo.inc" 26 27 namespace llvm { 28 29 class GCNTargetMachine; 30 31 class GCNSubtarget final : public AMDGPUGenSubtargetInfo, 32 public AMDGPUSubtarget { 33 public: 34 using AMDGPUSubtarget::getMaxWavesPerEU; 35 36 // Following 2 enums are documented at: 37 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 38 enum class TrapHandlerAbi { 39 NONE = 0x00, 40 AMDHSA = 0x01, 41 }; 42 43 enum class TrapID { 44 LLVMAMDHSATrap = 0x02, 45 LLVMAMDHSADebugTrap = 0x03, 46 }; 47 48 private: 49 /// GlobalISel related APIs. 50 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 51 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo; 52 std::unique_ptr<InstructionSelector> InstSelector; 53 std::unique_ptr<LegalizerInfo> Legalizer; 54 std::unique_ptr<RegisterBankInfo> RegBankInfo; 55 56 protected: 57 // Basic subtarget description. 
58 Triple TargetTriple; 59 AMDGPU::IsaInfo::AMDGPUTargetID TargetID; 60 unsigned Gen = INVALID; 61 InstrItineraryData InstrItins; 62 int LDSBankCount = 0; 63 unsigned MaxPrivateElementSize = 0; 64 65 // Possibly statically set by tablegen, but may want to be overridden. 66 bool FastFMAF32 = false; 67 bool FastDenormalF32 = false; 68 bool HalfRate64Ops = false; 69 bool FullRate64Ops = false; 70 71 // Dynamically set bits that enable features. 72 bool FlatForGlobal = false; 73 bool AutoWaitcntBeforeBarrier = false; 74 bool BackOffBarrier = false; 75 bool UnalignedScratchAccess = false; 76 bool UnalignedAccessMode = false; 77 bool HasApertureRegs = false; 78 bool SupportsXNACK = false; 79 80 // This should not be used directly. 'TargetID' tracks the dynamic settings 81 // for XNACK. 82 bool EnableXNACK = false; 83 84 bool EnableTgSplit = false; 85 bool EnableCuMode = false; 86 bool TrapHandler = false; 87 88 // Used as options. 89 bool EnableLoadStoreOpt = false; 90 bool EnableUnsafeDSOffsetFolding = false; 91 bool EnableSIScheduler = false; 92 bool EnableDS128 = false; 93 bool EnablePRTStrictNull = false; 94 bool DumpCode = false; 95 96 // Subtarget statically properties set by tablegen 97 bool FP64 = false; 98 bool FMA = false; 99 bool MIMG_R128 = false; 100 bool CIInsts = false; 101 bool GFX8Insts = false; 102 bool GFX9Insts = false; 103 bool GFX90AInsts = false; 104 bool GFX940Insts = false; 105 bool GFX10Insts = false; 106 bool GFX11Insts = false; 107 bool GFX10_3Insts = false; 108 bool GFX7GFX8GFX9Insts = false; 109 bool SGPRInitBug = false; 110 bool UserSGPRInit16Bug = false; 111 bool NegativeScratchOffsetBug = false; 112 bool NegativeUnalignedScratchOffsetBug = false; 113 bool HasSMemRealTime = false; 114 bool HasIntClamp = false; 115 bool HasFmaMixInsts = false; 116 bool HasMovrel = false; 117 bool HasVGPRIndexMode = false; 118 bool HasScalarStores = false; 119 bool HasScalarAtomics = false; 120 bool HasSDWAOmod = false; 121 bool HasSDWAScalar = false; 122 
bool HasSDWASdst = false;
  bool HasSDWAMac = false;
  bool HasSDWAOutModsVOPC = false;
  bool HasDPP = false;
  bool HasDPP8 = false;
  bool Has64BitDPP = false;
  bool HasPackedFP32Ops = false;
  bool HasImageInsts = false;
  bool HasExtendedImageInsts = false;
  bool HasR128A16 = false;
  bool HasA16 = false;
  bool HasG16 = false;
  bool HasNSAEncoding = false;
  unsigned NSAMaxSize = 0;
  bool GFX10_AEncoding = false;
  bool GFX10_BEncoding = false;
  bool HasDLInsts = false;
  bool HasFmacF64Inst = false;
  bool HasDot1Insts = false;
  bool HasDot2Insts = false;
  bool HasDot3Insts = false;
  bool HasDot4Insts = false;
  bool HasDot5Insts = false;
  bool HasDot6Insts = false;
  bool HasDot7Insts = false;
  bool HasDot8Insts = false;
  bool HasDot9Insts = false;
  bool HasMAIInsts = false;
  bool HasFP8Insts = false;
  bool HasPkFmacF16Inst = false;
  bool HasAtomicFaddRtnInsts = false;
  bool HasAtomicFaddNoRtnInsts = false;
  bool HasAtomicPkFaddNoRtnInsts = false;
  bool HasFlatAtomicFaddF32Inst = false;
  bool SupportsSRAMECC = false;

  // This should not be used directly. 'TargetID' tracks the dynamic settings
  // for SRAMECC.
bool EnableSRAMECC = false;

  bool HasNoSdstCMPX = false;
  bool HasVscnt = false;
  bool HasGetWaveIdInst = false;
  bool HasSMemTimeInst = false;
  bool HasShaderCyclesRegister = false;
  bool HasVOP3Literal = false;
  bool HasNoDataDepHazard = false;
  bool FlatAddressSpace = false;
  bool FlatInstOffsets = false;
  bool FlatGlobalInsts = false;
  bool FlatScratchInsts = false;
  bool ScalarFlatScratchInsts = false;
  bool HasArchitectedFlatScratch = false;
  bool EnableFlatScratch = false;
  bool AddNoCarryInsts = false;
  bool HasUnpackedD16VMem = false;
  bool LDSMisalignedBug = false;
  bool HasMFMAInlineLiteralBug = false;
  bool UnalignedBufferAccess = false;
  bool UnalignedDSAccess = false;
  bool HasPackedTID = false;
  bool ScalarizeGlobal = false;

  bool HasVcmpxPermlaneHazard = false;
  bool HasVMEMtoScalarWriteHazard = false;
  bool HasSMEMtoVectorWriteHazard = false;
  bool HasInstFwdPrefetchBug = false;
  bool HasVcmpxExecWARHazard = false;
  bool HasLdsBranchVmemWARHazard = false;
  bool HasNSAtoVMEMBug = false;
  bool HasNSAClauseBug = false;
  bool HasOffset3fBug = false;
  bool HasFlatSegmentOffsetBug = false;
  bool HasImageStoreD16Bug = false;
  bool HasImageGather4D16Bug = false;
  bool HasGFX11FullVGPRs = false;
  bool HasMADIntraFwdBug = false;
  bool HasVOPDInsts = false;
  bool HasVALUTransUseHazard = false;

  // Dummy feature to use for assembler in tablegen.
203 bool FeatureDisable = false; 204 205 SelectionDAGTargetInfo TSInfo; 206 private: 207 SIInstrInfo InstrInfo; 208 SITargetLowering TLInfo; 209 SIFrameLowering FrameLowering; 210 211 public: 212 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 213 const GCNTargetMachine &TM); 214 ~GCNSubtarget() override; 215 216 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, 217 StringRef GPU, StringRef FS); 218 219 const SIInstrInfo *getInstrInfo() const override { 220 return &InstrInfo; 221 } 222 223 const SIFrameLowering *getFrameLowering() const override { 224 return &FrameLowering; 225 } 226 227 const SITargetLowering *getTargetLowering() const override { 228 return &TLInfo; 229 } 230 231 const SIRegisterInfo *getRegisterInfo() const override { 232 return &InstrInfo.getRegisterInfo(); 233 } 234 235 const CallLowering *getCallLowering() const override { 236 return CallLoweringInfo.get(); 237 } 238 239 const InlineAsmLowering *getInlineAsmLowering() const override { 240 return InlineAsmLoweringInfo.get(); 241 } 242 243 InstructionSelector *getInstructionSelector() const override { 244 return InstSelector.get(); 245 } 246 247 const LegalizerInfo *getLegalizerInfo() const override { 248 return Legalizer.get(); 249 } 250 251 const RegisterBankInfo *getRegBankInfo() const override { 252 return RegBankInfo.get(); 253 } 254 255 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const { 256 return TargetID; 257 } 258 259 // Nothing implemented, just prevent crashes on use. 260 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 261 return &TSInfo; 262 } 263 264 const InstrItineraryData *getInstrItineraryData() const override { 265 return &InstrItins; 266 } 267 268 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); 269 270 Generation getGeneration() const { 271 return (Generation)Gen; 272 } 273 274 unsigned getMaxWaveScratchSize() const { 275 // See COMPUTE_TMPRING_SIZE.WAVESIZE. 
276 if (getGeneration() < GFX11) { 277 // 13-bit field in units of 256-dword. 278 return (256 * 4) * ((1 << 13) - 1); 279 } 280 // 15-bit field in units of 64-dword. 281 return (64 * 4) * ((1 << 15) - 1); 282 } 283 284 /// Return the number of high bits known to be zero for a frame index. 285 unsigned getKnownHighZeroBitsForFrameIndex() const { 286 return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2(); 287 } 288 289 int getLDSBankCount() const { 290 return LDSBankCount; 291 } 292 293 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { 294 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; 295 } 296 297 unsigned getConstantBusLimit(unsigned Opcode) const; 298 299 /// Returns if the result of this instruction with a 16-bit result returned in 300 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve 301 /// the original value. 302 bool zeroesHigh16BitsOfDest(unsigned Opcode) const; 303 304 bool supportsWGP() const { return getGeneration() >= GFX10; } 305 306 bool hasIntClamp() const { 307 return HasIntClamp; 308 } 309 310 bool hasFP64() const { 311 return FP64; 312 } 313 314 bool hasMIMG_R128() const { 315 return MIMG_R128; 316 } 317 318 bool hasHWFP64() const { 319 return FP64; 320 } 321 322 bool hasFastFMAF32() const { 323 return FastFMAF32; 324 } 325 326 bool hasHalfRate64Ops() const { 327 return HalfRate64Ops; 328 } 329 330 bool hasFullRate64Ops() const { 331 return FullRate64Ops; 332 } 333 334 bool hasAddr64() const { 335 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 336 } 337 338 bool hasFlat() const { 339 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS); 340 } 341 342 // Return true if the target only has the reverse operand versions of VALU 343 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). 
344 bool hasOnlyRevVALUShifts() const { 345 return getGeneration() >= VOLCANIC_ISLANDS; 346 } 347 348 bool hasFractBug() const { 349 return getGeneration() == SOUTHERN_ISLANDS; 350 } 351 352 bool hasBFE() const { 353 return true; 354 } 355 356 bool hasBFI() const { 357 return true; 358 } 359 360 bool hasBFM() const { 361 return hasBFE(); 362 } 363 364 bool hasBCNT(unsigned Size) const { 365 return true; 366 } 367 368 bool hasFFBL() const { 369 return true; 370 } 371 372 bool hasFFBH() const { 373 return true; 374 } 375 376 bool hasMed3_16() const { 377 return getGeneration() >= AMDGPUSubtarget::GFX9; 378 } 379 380 bool hasMin3Max3_16() const { 381 return getGeneration() >= AMDGPUSubtarget::GFX9; 382 } 383 384 bool hasFmaMixInsts() const { 385 return HasFmaMixInsts; 386 } 387 388 bool hasCARRY() const { 389 return true; 390 } 391 392 bool hasFMA() const { 393 return FMA; 394 } 395 396 bool hasSwap() const { 397 return GFX9Insts; 398 } 399 400 bool hasScalarPackInsts() const { 401 return GFX9Insts; 402 } 403 404 bool hasScalarMulHiInsts() const { 405 return GFX9Insts; 406 } 407 408 TrapHandlerAbi getTrapHandlerAbi() const { 409 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; 410 } 411 412 bool supportsGetDoorbellID() const { 413 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. 414 return getGeneration() >= GFX9; 415 } 416 417 /// True if the offset field of DS instructions works as expected. On SI, the 418 /// offset uses a 16-bit adder and does not always wrap properly. 419 bool hasUsableDSOffset() const { 420 return getGeneration() >= SEA_ISLANDS; 421 } 422 423 bool unsafeDSOffsetFoldingEnabled() const { 424 return EnableUnsafeDSOffsetFolding; 425 } 426 427 /// Condition output from div_scale is usable. 428 bool hasUsableDivScaleConditionOutput() const { 429 return getGeneration() != SOUTHERN_ISLANDS; 430 } 431 432 /// Extra wait hazard is needed in some cases before 433 /// s_cbranch_vccnz/s_cbranch_vccz. 
434 bool hasReadVCCZBug() const { 435 return getGeneration() <= SEA_ISLANDS; 436 } 437 438 /// Writes to VCC_LO/VCC_HI update the VCCZ flag. 439 bool partialVCCWritesUpdateVCCZ() const { 440 return getGeneration() >= GFX10; 441 } 442 443 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 444 /// was written by a VALU instruction. 445 bool hasSMRDReadVALUDefHazard() const { 446 return getGeneration() == SOUTHERN_ISLANDS; 447 } 448 449 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 450 /// SGPR was written by a VALU Instruction. 451 bool hasVMEMReadSGPRVALUDefHazard() const { 452 return getGeneration() >= VOLCANIC_ISLANDS; 453 } 454 455 bool hasRFEHazards() const { 456 return getGeneration() >= VOLCANIC_ISLANDS; 457 } 458 459 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. 460 unsigned getSetRegWaitStates() const { 461 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 462 } 463 464 bool dumpCode() const { 465 return DumpCode; 466 } 467 468 /// Return the amount of LDS that can be used that will not restrict the 469 /// occupancy lower than WaveCount. 470 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 471 const Function &) const; 472 473 bool supportsMinMaxDenormModes() const { 474 return getGeneration() >= AMDGPUSubtarget::GFX9; 475 } 476 477 /// \returns If target supports S_DENORM_MODE. 478 bool hasDenormModeInst() const { 479 return getGeneration() >= AMDGPUSubtarget::GFX10; 480 } 481 482 bool useFlatForGlobal() const { 483 return FlatForGlobal; 484 } 485 486 /// \returns If target supports ds_read/write_b128 and user enables generation 487 /// of ds_read/write_b128. 488 bool useDS128() const { 489 return CIInsts && EnableDS128; 490 } 491 492 /// \return If target supports ds_read/write_b96/128. 
493 bool hasDS96AndDS128() const { 494 return CIInsts; 495 } 496 497 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 498 bool haveRoundOpsF64() const { 499 return CIInsts; 500 } 501 502 /// \returns If MUBUF instructions always perform range checking, even for 503 /// buffer resources used for private memory access. 504 bool privateMemoryResourceIsRangeChecked() const { 505 return getGeneration() < AMDGPUSubtarget::GFX9; 506 } 507 508 /// \returns If target requires PRT Struct NULL support (zero result registers 509 /// for sparse texture support). 510 bool usePRTStrictNull() const { 511 return EnablePRTStrictNull; 512 } 513 514 bool hasAutoWaitcntBeforeBarrier() const { 515 return AutoWaitcntBeforeBarrier; 516 } 517 518 /// \returns true if the target supports backing off of s_barrier instructions 519 /// when an exception is raised. 520 bool supportsBackOffBarrier() const { 521 return BackOffBarrier; 522 } 523 524 bool hasUnalignedBufferAccess() const { 525 return UnalignedBufferAccess; 526 } 527 528 bool hasUnalignedBufferAccessEnabled() const { 529 return UnalignedBufferAccess && UnalignedAccessMode; 530 } 531 532 bool hasUnalignedDSAccess() const { 533 return UnalignedDSAccess; 534 } 535 536 bool hasUnalignedDSAccessEnabled() const { 537 return UnalignedDSAccess && UnalignedAccessMode; 538 } 539 540 bool hasUnalignedScratchAccess() const { 541 return UnalignedScratchAccess; 542 } 543 544 bool hasUnalignedAccessMode() const { 545 return UnalignedAccessMode; 546 } 547 548 bool hasApertureRegs() const { 549 return HasApertureRegs; 550 } 551 552 bool isTrapHandlerEnabled() const { 553 return TrapHandler; 554 } 555 556 bool isXNACKEnabled() const { 557 return TargetID.isXnackOnOrAny(); 558 } 559 560 bool isTgSplitEnabled() const { 561 return EnableTgSplit; 562 } 563 564 bool isCuModeEnabled() const { 565 return EnableCuMode; 566 } 567 568 bool hasFlatAddressSpace() const { 569 return FlatAddressSpace; 570 } 571 572 bool hasFlatScrRegister() const { 573 return 
hasFlatAddressSpace(); 574 } 575 576 bool hasFlatInstOffsets() const { 577 return FlatInstOffsets; 578 } 579 580 bool hasFlatGlobalInsts() const { 581 return FlatGlobalInsts; 582 } 583 584 bool hasFlatScratchInsts() const { 585 return FlatScratchInsts; 586 } 587 588 // Check if target supports ST addressing mode with FLAT scratch instructions. 589 // The ST addressing mode means no registers are used, either VGPR or SGPR, 590 // but only immediate offset is swizzled and added to the FLAT scratch base. 591 bool hasFlatScratchSTMode() const { 592 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts()); 593 } 594 595 bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; } 596 597 bool hasScalarFlatScratchInsts() const { 598 return ScalarFlatScratchInsts; 599 } 600 601 bool enableFlatScratch() const { 602 return flatScratchIsArchitected() || 603 (EnableFlatScratch && hasFlatScratchInsts()); 604 } 605 606 bool hasGlobalAddTidInsts() const { 607 return GFX10_BEncoding; 608 } 609 610 bool hasAtomicCSub() const { 611 return GFX10_BEncoding; 612 } 613 614 bool hasMultiDwordFlatScratchAddressing() const { 615 return getGeneration() >= GFX9; 616 } 617 618 bool hasFlatSegmentOffsetBug() const { 619 return HasFlatSegmentOffsetBug; 620 } 621 622 bool hasFlatLgkmVMemCountInOrder() const { 623 return getGeneration() > GFX9; 624 } 625 626 bool hasD16LoadStore() const { 627 return getGeneration() >= GFX9; 628 } 629 630 bool d16PreservesUnusedBits() const { 631 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny(); 632 } 633 634 bool hasD16Images() const { 635 return getGeneration() >= VOLCANIC_ISLANDS; 636 } 637 638 /// Return if most LDS instructions have an m0 use that require m0 to be 639 /// initialized. 640 bool ldsRequiresM0Init() const { 641 return getGeneration() < GFX9; 642 } 643 644 // True if the hardware rewinds and replays GWS operations if a wave is 645 // preempted. 
646 // 647 // If this is false, a GWS operation requires testing if a nack set the 648 // MEM_VIOL bit, and repeating if so. 649 bool hasGWSAutoReplay() const { 650 return getGeneration() >= GFX9; 651 } 652 653 /// \returns if target has ds_gws_sema_release_all instruction. 654 bool hasGWSSemaReleaseAll() const { 655 return CIInsts; 656 } 657 658 /// \returns true if the target has integer add/sub instructions that do not 659 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, 660 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier 661 /// for saturation. 662 bool hasAddNoCarry() const { 663 return AddNoCarryInsts; 664 } 665 666 bool hasUnpackedD16VMem() const { 667 return HasUnpackedD16VMem; 668 } 669 670 // Covers VS/PS/CS graphics shaders 671 bool isMesaGfxShader(const Function &F) const { 672 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 673 } 674 675 bool hasMad64_32() const { 676 return getGeneration() >= SEA_ISLANDS; 677 } 678 679 bool hasSDWAOmod() const { 680 return HasSDWAOmod; 681 } 682 683 bool hasSDWAScalar() const { 684 return HasSDWAScalar; 685 } 686 687 bool hasSDWASdst() const { 688 return HasSDWASdst; 689 } 690 691 bool hasSDWAMac() const { 692 return HasSDWAMac; 693 } 694 695 bool hasSDWAOutModsVOPC() const { 696 return HasSDWAOutModsVOPC; 697 } 698 699 bool hasDLInsts() const { 700 return HasDLInsts; 701 } 702 703 bool hasFmacF64Inst() const { return HasFmacF64Inst; } 704 705 bool hasDot1Insts() const { 706 return HasDot1Insts; 707 } 708 709 bool hasDot2Insts() const { 710 return HasDot2Insts; 711 } 712 713 bool hasDot3Insts() const { 714 return HasDot3Insts; 715 } 716 717 bool hasDot4Insts() const { 718 return HasDot4Insts; 719 } 720 721 bool hasDot5Insts() const { 722 return HasDot5Insts; 723 } 724 725 bool hasDot6Insts() const { 726 return HasDot6Insts; 727 } 728 729 bool hasDot7Insts() const { 730 return HasDot7Insts; 731 } 732 733 bool hasDot8Insts() const { 734 return HasDot8Insts; 735 
} 736 737 bool hasDot9Insts() const { 738 return HasDot9Insts; 739 } 740 741 bool hasMAIInsts() const { 742 return HasMAIInsts; 743 } 744 745 bool hasFP8Insts() const { 746 return HasFP8Insts; 747 } 748 749 bool hasPkFmacF16Inst() const { 750 return HasPkFmacF16Inst; 751 } 752 753 bool hasAtomicFaddInsts() const { 754 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts; 755 } 756 757 bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; } 758 759 bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } 760 761 bool hasAtomicPkFaddNoRtnInsts() const { return HasAtomicPkFaddNoRtnInsts; } 762 763 bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } 764 765 bool hasNoSdstCMPX() const { 766 return HasNoSdstCMPX; 767 } 768 769 bool hasVscnt() const { 770 return HasVscnt; 771 } 772 773 bool hasGetWaveIdInst() const { 774 return HasGetWaveIdInst; 775 } 776 777 bool hasSMemTimeInst() const { 778 return HasSMemTimeInst; 779 } 780 781 bool hasShaderCyclesRegister() const { 782 return HasShaderCyclesRegister; 783 } 784 785 bool hasVOP3Literal() const { 786 return HasVOP3Literal; 787 } 788 789 bool hasNoDataDepHazard() const { 790 return HasNoDataDepHazard; 791 } 792 793 bool vmemWriteNeedsExpWaitcnt() const { 794 return getGeneration() < SEA_ISLANDS; 795 } 796 797 bool hasInstPrefetch() const { return getGeneration() >= GFX10; } 798 799 // Scratch is allocated in 256 dword per wave blocks for the entire 800 // wavefront. When viewed from the perspective of an arbitrary workitem, this 801 // is 4-byte aligned. 802 // 803 // Only 4-byte alignment is really needed to access anything. Transformations 804 // on the pointer value itself may rely on the alignment / known low bits of 805 // the pointer. Set this to something above the minimum to avoid needing 806 // dynamic realignment in common cases. 
807 Align getStackAlignment() const { return Align(16); } 808 809 bool enableMachineScheduler() const override { 810 return true; 811 } 812 813 bool useAA() const override; 814 815 bool enableSubRegLiveness() const override { 816 return true; 817 } 818 819 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } 820 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 821 822 // static wrappers 823 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 824 825 // XXX - Why is this here if it isn't in the default pass set? 826 bool enableEarlyIfConversion() const override { 827 return true; 828 } 829 830 void overrideSchedPolicy(MachineSchedPolicy &Policy, 831 unsigned NumRegionInstrs) const override; 832 833 unsigned getMaxNumUserSGPRs() const { 834 return 16; 835 } 836 837 bool hasSMemRealTime() const { 838 return HasSMemRealTime; 839 } 840 841 bool hasMovrel() const { 842 return HasMovrel; 843 } 844 845 bool hasVGPRIndexMode() const { 846 return HasVGPRIndexMode; 847 } 848 849 bool useVGPRIndexMode() const; 850 851 bool hasScalarCompareEq64() const { 852 return getGeneration() >= VOLCANIC_ISLANDS; 853 } 854 855 bool hasScalarStores() const { 856 return HasScalarStores; 857 } 858 859 bool hasScalarAtomics() const { 860 return HasScalarAtomics; 861 } 862 863 bool hasLDSFPAtomicAdd() const { return GFX8Insts; } 864 865 /// \returns true if the subtarget has the v_permlanex16_b32 instruction. 866 bool hasPermLaneX16() const { return getGeneration() >= GFX10; } 867 868 /// \returns true if the subtarget has the v_permlane64_b32 instruction. 
869 bool hasPermLane64() const { return getGeneration() >= GFX11; } 870 871 bool hasDPP() const { 872 return HasDPP; 873 } 874 875 bool hasDPPBroadcasts() const { 876 return HasDPP && getGeneration() < GFX10; 877 } 878 879 bool hasDPPWavefrontShifts() const { 880 return HasDPP && getGeneration() < GFX10; 881 } 882 883 bool hasDPP8() const { 884 return HasDPP8; 885 } 886 887 bool has64BitDPP() const { 888 return Has64BitDPP; 889 } 890 891 bool hasPackedFP32Ops() const { 892 return HasPackedFP32Ops; 893 } 894 895 bool hasFmaakFmamkF32Insts() const { 896 return getGeneration() >= GFX10 || hasGFX940Insts(); 897 } 898 899 bool hasImageInsts() const { 900 return HasImageInsts; 901 } 902 903 bool hasExtendedImageInsts() const { 904 return HasExtendedImageInsts; 905 } 906 907 bool hasR128A16() const { 908 return HasR128A16; 909 } 910 911 bool hasA16() const { return HasA16; } 912 913 bool hasG16() const { return HasG16; } 914 915 bool hasOffset3fBug() const { 916 return HasOffset3fBug; 917 } 918 919 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } 920 921 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } 922 923 bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; } 924 925 bool hasNSAEncoding() const { return HasNSAEncoding; } 926 927 unsigned getNSAMaxSize() const { return NSAMaxSize; } 928 929 bool hasGFX10_AEncoding() const { 930 return GFX10_AEncoding; 931 } 932 933 bool hasGFX10_BEncoding() const { 934 return GFX10_BEncoding; 935 } 936 937 bool hasGFX10_3Insts() const { 938 return GFX10_3Insts; 939 } 940 941 bool hasMadF16() const; 942 943 bool hasMovB64() const { return GFX940Insts; } 944 945 bool hasLshlAddB64() const { return GFX940Insts; } 946 947 bool enableSIScheduler() const { 948 return EnableSIScheduler; 949 } 950 951 bool loadStoreOptEnabled() const { 952 return EnableLoadStoreOpt; 953 } 954 955 bool hasSGPRInitBug() const { 956 return SGPRInitBug; 957 } 958 959 bool hasUserSGPRInit16Bug() const { 960 return 
UserSGPRInit16Bug && isWave32(); 961 } 962 963 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } 964 965 bool hasNegativeUnalignedScratchOffsetBug() const { 966 return NegativeUnalignedScratchOffsetBug; 967 } 968 969 bool hasMFMAInlineLiteralBug() const { 970 return HasMFMAInlineLiteralBug; 971 } 972 973 bool has12DWordStoreHazard() const { 974 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 975 } 976 977 // \returns true if the subtarget supports DWORDX3 load/store instructions. 978 bool hasDwordx3LoadStores() const { 979 return CIInsts; 980 } 981 982 bool hasReadM0MovRelInterpHazard() const { 983 return getGeneration() == AMDGPUSubtarget::GFX9; 984 } 985 986 bool hasReadM0SendMsgHazard() const { 987 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 988 getGeneration() <= AMDGPUSubtarget::GFX9; 989 } 990 991 bool hasReadM0LdsDmaHazard() const { 992 return getGeneration() == AMDGPUSubtarget::GFX9; 993 } 994 995 bool hasReadM0LdsDirectHazard() const { 996 return getGeneration() == AMDGPUSubtarget::GFX9; 997 } 998 999 bool hasVcmpxPermlaneHazard() const { 1000 return HasVcmpxPermlaneHazard; 1001 } 1002 1003 bool hasVMEMtoScalarWriteHazard() const { 1004 return HasVMEMtoScalarWriteHazard; 1005 } 1006 1007 bool hasSMEMtoVectorWriteHazard() const { 1008 return HasSMEMtoVectorWriteHazard; 1009 } 1010 1011 bool hasLDSMisalignedBug() const { 1012 return LDSMisalignedBug && !EnableCuMode; 1013 } 1014 1015 bool hasInstFwdPrefetchBug() const { 1016 return HasInstFwdPrefetchBug; 1017 } 1018 1019 bool hasVcmpxExecWARHazard() const { 1020 return HasVcmpxExecWARHazard; 1021 } 1022 1023 bool hasLdsBranchVmemWARHazard() const { 1024 return HasLdsBranchVmemWARHazard; 1025 } 1026 1027 // Shift amount of a 64 bit shift cannot be a highest allocated register 1028 // if also at the end of the allocation block. 
1029 bool hasShift64HighRegBug() const { 1030 return GFX90AInsts && !GFX940Insts; 1031 } 1032 1033 // Has one cycle hazard on transcendental instruction feeding a 1034 // non transcendental VALU. 1035 bool hasTransForwardingHazard() const { return GFX940Insts; } 1036 1037 // Has one cycle hazard on a VALU instruction partially writing dst with 1038 // a shift of result bits feeding another VALU instruction. 1039 bool hasDstSelForwardingHazard() const { return GFX940Insts; } 1040 1041 // Cannot use op_sel with v_dot instructions. 1042 bool hasDOTOpSelHazard() const { return GFX940Insts; } 1043 1044 // Does not have HW interlocs for VALU writing and then reading SGPRs. 1045 bool hasVDecCoExecHazard() const { 1046 return GFX940Insts; 1047 } 1048 1049 bool hasNSAtoVMEMBug() const { 1050 return HasNSAtoVMEMBug; 1051 } 1052 1053 bool hasNSAClauseBug() const { return HasNSAClauseBug; } 1054 1055 bool hasHardClauses() const { return getGeneration() >= GFX10; } 1056 1057 bool hasGFX90AInsts() const { return GFX90AInsts; } 1058 1059 bool hasFPAtomicToDenormModeHazard() const { 1060 return getGeneration() == GFX10; 1061 } 1062 1063 bool hasVOP3DPP() const { return getGeneration() >= GFX11; } 1064 1065 bool hasLdsDirect() const { return getGeneration() >= GFX11; } 1066 1067 bool hasVALUPartialForwardingHazard() const { 1068 return getGeneration() >= GFX11; 1069 } 1070 1071 bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; } 1072 1073 bool hasVALUMaskWriteHazard() const { return getGeneration() >= GFX11; } 1074 1075 /// Return if operations acting on VGPR tuples require even alignment. 1076 bool needsAlignedVGPRs() const { return GFX90AInsts; } 1077 1078 /// Return true if the target has the S_PACK_HL_B32_B16 instruction. 1079 bool hasSPackHL() const { return GFX11Insts; } 1080 1081 /// Return true if the target's EXP instruction has the COMPR flag, which 1082 /// affects the meaning of the EN (enable) bits. 
1083 bool hasCompressedExport() const { return !GFX11Insts; } 1084 1085 /// Return true if the target's EXP instruction supports the NULL export 1086 /// target. 1087 bool hasNullExportTarget() const { return !GFX11Insts; } 1088 1089 bool hasGFX11FullVGPRs() const { return HasGFX11FullVGPRs; } 1090 1091 bool hasVOPDInsts() const { return HasVOPDInsts; } 1092 1093 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } 1094 1095 /// Return true if the target has the S_DELAY_ALU instruction. 1096 bool hasDelayAlu() const { return GFX11Insts; } 1097 1098 bool hasPackedTID() const { return HasPackedTID; } 1099 1100 // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that 1101 // hasGFX90AInsts is also true. 1102 bool hasGFX940Insts() const { return GFX940Insts; } 1103 1104 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 1105 /// SGPRs 1106 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 1107 1108 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 1109 /// VGPRs 1110 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 1111 1112 /// Return occupancy for the given function. Used LDS and a number of 1113 /// registers if provided. 1114 /// Note, occupancy can be affected by the scratch allocation as well, but 1115 /// we do not have enough information to compute it. 1116 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, 1117 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; 1118 1119 /// \returns true if the flat_scratch register should be initialized with the 1120 /// pointer to the wave's scratch memory rather than a size and offset. 1121 bool flatScratchIsPointer() const { 1122 return getGeneration() >= AMDGPUSubtarget::GFX9; 1123 } 1124 1125 /// \returns true if the flat_scratch register is initialized by the HW. 1126 /// In this case it is readonly. 
1127 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } 1128 1129 /// \returns true if the machine has merged shaders in which s0-s7 are 1130 /// reserved by the hardware and user SGPRs start at s8 1131 bool hasMergedShaders() const { 1132 return getGeneration() >= GFX9; 1133 } 1134 1135 // \returns true if the target supports the pre-NGG legacy geometry path. 1136 bool hasLegacyGeometry() const { return getGeneration() < GFX11; } 1137 1138 /// \returns SGPR allocation granularity supported by the subtarget. 1139 unsigned getSGPRAllocGranule() const { 1140 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 1141 } 1142 1143 /// \returns SGPR encoding granularity supported by the subtarget. 1144 unsigned getSGPREncodingGranule() const { 1145 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 1146 } 1147 1148 /// \returns Total number of SGPRs supported by the subtarget. 1149 unsigned getTotalNumSGPRs() const { 1150 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 1151 } 1152 1153 /// \returns Addressable number of SGPRs supported by the subtarget. 1154 unsigned getAddressableNumSGPRs() const { 1155 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); 1156 } 1157 1158 /// \returns Minimum number of SGPRs that meets the given number of waves per 1159 /// execution unit requirement supported by the subtarget. 1160 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 1161 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); 1162 } 1163 1164 /// \returns Maximum number of SGPRs that meets the given number of waves per 1165 /// execution unit requirement supported by the subtarget. 1166 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 1167 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); 1168 } 1169 1170 /// \returns Reserved number of SGPRs. This is common 1171 /// utility function called by MachineFunction and 1172 /// Function variants of getReservedNumSGPRs. 
1173 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const; 1174 /// \returns Reserved number of SGPRs for given machine function \p MF. 1175 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 1176 1177 /// \returns Reserved number of SGPRs for given function \p F. 1178 unsigned getReservedNumSGPRs(const Function &F) const; 1179 1180 /// \returns max num SGPRs. This is the common utility 1181 /// function called by MachineFunction and Function 1182 /// variants of getMaxNumSGPRs. 1183 unsigned getBaseMaxNumSGPRs(const Function &F, 1184 std::pair<unsigned, unsigned> WavesPerEU, 1185 unsigned PreloadedSGPRs, 1186 unsigned ReservedNumSGPRs) const; 1187 1188 /// \returns Maximum number of SGPRs that meets number of waves per execution 1189 /// unit requirement for function \p MF, or number of SGPRs explicitly 1190 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 1191 /// 1192 /// \returns Value that meets number of waves per execution unit requirement 1193 /// if explicitly requested value cannot be converted to integer, violates 1194 /// subtarget's specifications, or does not meet number of waves per execution 1195 /// unit requirement. 1196 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 1197 1198 /// \returns Maximum number of SGPRs that meets number of waves per execution 1199 /// unit requirement for function \p F, or number of SGPRs explicitly 1200 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F. 1201 /// 1202 /// \returns Value that meets number of waves per execution unit requirement 1203 /// if explicitly requested value cannot be converted to integer, violates 1204 /// subtarget's specifications, or does not meet number of waves per execution 1205 /// unit requirement. 1206 unsigned getMaxNumSGPRs(const Function &F) const; 1207 1208 /// \returns VGPR allocation granularity supported by the subtarget. 
1209 unsigned getVGPRAllocGranule() const { 1210 return AMDGPU::IsaInfo::getVGPRAllocGranule(this); 1211 } 1212 1213 /// \returns VGPR encoding granularity supported by the subtarget. 1214 unsigned getVGPREncodingGranule() const { 1215 return AMDGPU::IsaInfo::getVGPREncodingGranule(this); 1216 } 1217 1218 /// \returns Total number of VGPRs supported by the subtarget. 1219 unsigned getTotalNumVGPRs() const { 1220 return AMDGPU::IsaInfo::getTotalNumVGPRs(this); 1221 } 1222 1223 /// \returns Addressable number of VGPRs supported by the subtarget. 1224 unsigned getAddressableNumVGPRs() const { 1225 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); 1226 } 1227 1228 /// \returns the minimum number of VGPRs that will prevent achieving more than 1229 /// the specified number of waves \p WavesPerEU. 1230 unsigned getMinNumVGPRs(unsigned WavesPerEU) const { 1231 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); 1232 } 1233 1234 /// \returns the maximum number of VGPRs that can be used and still achieved 1235 /// at least the specified number of waves \p WavesPerEU. 1236 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { 1237 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); 1238 } 1239 1240 /// \returns max num VGPRs. This is the common utility function 1241 /// called by MachineFunction and Function variants of getMaxNumVGPRs. 1242 unsigned getBaseMaxNumVGPRs(const Function &F, 1243 std::pair<unsigned, unsigned> WavesPerEU) const; 1244 /// \returns Maximum number of VGPRs that meets number of waves per execution 1245 /// unit requirement for function \p F, or number of VGPRs explicitly 1246 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F. 1247 /// 1248 /// \returns Value that meets number of waves per execution unit requirement 1249 /// if explicitly requested value cannot be converted to integer, violates 1250 /// subtarget's specifications, or does not meet number of waves per execution 1251 /// unit requirement. 
1252 unsigned getMaxNumVGPRs(const Function &F) const; 1253 1254 unsigned getMaxNumAGPRs(const Function &F) const { 1255 return getMaxNumVGPRs(F); 1256 } 1257 1258 /// \returns Maximum number of VGPRs that meets number of waves per execution 1259 /// unit requirement for function \p MF, or number of VGPRs explicitly 1260 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 1261 /// 1262 /// \returns Value that meets number of waves per execution unit requirement 1263 /// if explicitly requested value cannot be converted to integer, violates 1264 /// subtarget's specifications, or does not meet number of waves per execution 1265 /// unit requirement. 1266 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 1267 1268 void getPostRAMutations( 1269 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) 1270 const override; 1271 1272 std::unique_ptr<ScheduleDAGMutation> 1273 createFillMFMAShadowMutation(const TargetInstrInfo *TII) const; 1274 1275 bool isWave32() const { 1276 return getWavefrontSize() == 32; 1277 } 1278 1279 bool isWave64() const { 1280 return getWavefrontSize() == 64; 1281 } 1282 1283 const TargetRegisterClass *getBoolRC() const { 1284 return getRegisterInfo()->getBoolRC(); 1285 } 1286 1287 /// \returns Maximum number of work groups per compute unit supported by the 1288 /// subtarget and limited by given \p FlatWorkGroupSize. 1289 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1290 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1291 } 1292 1293 /// \returns Minimum flat work group size supported by the subtarget. 1294 unsigned getMinFlatWorkGroupSize() const override { 1295 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1296 } 1297 1298 /// \returns Maximum flat work group size supported by the subtarget. 
1299 unsigned getMaxFlatWorkGroupSize() const override { 1300 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1301 } 1302 1303 /// \returns Number of waves per execution unit required to support the given 1304 /// \p FlatWorkGroupSize. 1305 unsigned 1306 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { 1307 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); 1308 } 1309 1310 /// \returns Minimum number of waves per execution unit supported by the 1311 /// subtarget. 1312 unsigned getMinWavesPerEU() const override { 1313 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1314 } 1315 1316 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, 1317 SDep &Dep) const override; 1318 1319 // \returns true if it's beneficial on this subtarget for the scheduler to 1320 // cluster stores as well as loads. 1321 bool shouldClusterStores() const { return getGeneration() >= GFX11; } 1322 1323 // \returns the number of address arguments from which to enable MIMG NSA 1324 // on supported architectures. 1325 unsigned getNSAThreshold(const MachineFunction &MF) const; 1326 }; 1327 1328 } // end namespace llvm 1329 1330 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 1331